From 9a812198ae49967f239789164c55ec3e72b7e0dd Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Thu, 14 Aug 2008 14:08:44 +0200
Subject: IPVS: Add genetlink interface implementation

Add the implementation of the new Generic Netlink interface to IPVS and
keep the old set/getsockopt interface for userspace backwards
compatibility.

Signed-off-by: Julius Volz <juliusv@google.com>
Acked-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 875 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 875 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 6379705a8dc..d1dbd8b311b 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -37,6 +37,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/sock.h>
+#include <net/genetlink.h>
 
 #include <asm/uaccess.h>
 
@@ -2320,6 +2321,872 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
 	.owner		= THIS_MODULE,
 };
 
+/*
+ * Generic Netlink interface
+ */
+
+/* IPVS genetlink family */
+static struct genl_family ip_vs_genl_family = {
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= IPVS_GENL_NAME,
+	.version	= IPVS_GENL_VERSION,
+	.maxattr	= IPVS_CMD_MAX,
+};
+
+/* Policy used for first-level command attributes */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_IFNAME_MAXLEN },
+	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
+					    .len = sizeof(struct ip_vs_flags) },
+	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+				 struct ip_vs_stats *stats)
+{
+	struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+	if (!nl_stats)
+		return -EMSGSIZE;
+
+	spin_lock_bh(&stats->lock);
+
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps);
+
+	spin_unlock_bh(&stats->lock);
+
+	nla_nest_end(skb, nl_stats);
+
+	return 0;
+
+nla_put_failure:
+	spin_unlock_bh(&stats->lock);
+	nla_nest_cancel(skb, nl_stats);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct nlattr *nl_service;
+	struct ip_vs_flags flags = { .flags = svc->flags,
+				     .mask = ~0 };
+
+	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		return -EMSGSIZE;
+
+	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET);
+
+	if (svc->fwmark) {
+		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+	} else {
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
+		NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+	}
+
+	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nl_service);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_service);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc,
+				   struct netlink_callback *cb)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_SERVICE);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_service(skb, svc) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	int idx = 0, i;
+	int start = cb->args[0];
+	struct ip_vs_service *svc;
+
+	mutex_lock(&__ip_vs_mutex);
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+			if (++idx <= start)
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				idx--;
+				goto nla_put_failure;
+			}
+		}
+	}
+
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+			if (++idx <= start)
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				idx--;
+				goto nla_put_failure;
+			}
+		}
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
+				    struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+
+	/* Parse mandatory identifying service fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+		return -EINVAL;
+
+	nla_af		= attrs[IPVS_SVC_ATTR_AF];
+	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
+	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
+	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
+	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];
+
+	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
+		return -EINVAL;
+
+	/* For now, only support IPv4 */
+	if (nla_get_u16(nla_af) != AF_INET)
+		return -EAFNOSUPPORT;
+
+	if (nla_fwmark) {
+		usvc->protocol = IPPROTO_TCP;
+		usvc->fwmark = nla_get_u32(nla_fwmark);
+	} else {
+		usvc->protocol = nla_get_u16(nla_protocol);
+		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+		usvc->port = nla_get_u16(nla_port);
+		usvc->fwmark = 0;
+	}
+
+	/* If a full entry was requested, check for the additional fields */
+	if (full_entry) {
+		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+			      *nla_netmask;
+		struct ip_vs_flags flags;
+		struct ip_vs_service *svc;
+
+		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
+		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
+		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
+
+		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
+			return -EINVAL;
+
+		nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+		/* prefill flags from service if it already exists */
+		if (usvc->fwmark)
+			svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+		else
+			svc = __ip_vs_service_get(usvc->protocol, usvc->addr,
+						  usvc->port);
+		if (svc) {
+			usvc->flags = svc->flags;
+			ip_vs_service_put(svc);
+		} else
+			usvc->flags = 0;
+
+		/* set new flags from userland */
+		usvc->flags = (usvc->flags & ~flags.mask) |
+			      (flags.flags & flags.mask);
+
+		strlcpy(usvc->sched_name, nla_data(nla_sched),
+			sizeof(usvc->sched_name));
+		usvc->timeout = nla_get_u32(nla_timeout);
+		usvc->netmask = nla_get_u32(nla_netmask);
+	}
+
+	return 0;
+}
+
+static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+{
+	struct ip_vs_service_user usvc;
+	int ret;
+
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (usvc.fwmark)
+		return __ip_vs_svc_fwm_get(usvc.fwmark);
+	else
+		return __ip_vs_service_get(usvc.protocol, usvc.addr,
+					   usvc.port);
+}
+
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+	struct nlattr *nl_dest;
+
+	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+	if (!nl_dest)
+		return -EMSGSIZE;
+
+	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
+	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
+
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+		    atomic_read(&dest->activeconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+		    atomic_read(&dest->inactconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+		    atomic_read(&dest->persistconns));
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nl_dest);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_dest);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+				struct netlink_callback *cb)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DEST);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_dest(skb, dest) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	int idx = 0;
+	int start = cb->args[0];
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+
+	mutex_lock(&__ip_vs_mutex);
+
+	/* Try to find the service for which to dump destinations */
+	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
+		goto out_err;
+
+	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+	if (IS_ERR(svc) || svc == NULL)
+		goto out_err;
+
+	/* Dump the destinations */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++idx <= start)
+			continue;
+		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+			idx--;
+			goto nla_put_failure;
+		}
+	}
+
+nla_put_failure:
+	cb->args[0] = idx;
+	ip_vs_service_put(svc);
+
+out_err:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest,
+				 struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+	struct nlattr *nla_addr, *nla_port;
+
+	/* Parse mandatory identifying destination fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+		return -EINVAL;
+
+	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
+	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
+
+	if (!(nla_addr && nla_port))
+		return -EINVAL;
+
+	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+	udest->port = nla_get_u16(nla_port);
+
+	/* If a full entry was requested, check for the additional fields */
+	if (full_entry) {
+		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
+			      *nla_l_thresh;
+
+		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
+		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
+		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
+		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
+
+		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
+			return -EINVAL;
+
+		udest->conn_flags = nla_get_u32(nla_fwd)
+				    & IP_VS_CONN_F_FWD_MASK;
+		udest->weight = nla_get_u32(nla_weight);
+		udest->u_threshold = nla_get_u32(nla_u_thresh);
+		udest->l_threshold = nla_get_u32(nla_l_thresh);
+	}
+
+	return 0;
+}
+
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
+				  const char *mcast_ifn, __be32 syncid)
+{
+	struct nlattr *nl_daemon;
+
+	nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+	if (!nl_daemon)
+		return -EMSGSIZE;
+
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
+	NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
+
+	nla_nest_end(skb, nl_daemon);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_daemon);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
+				  const char *mcast_ifn, __be32 syncid,
+				  struct netlink_callback *cb)
+{
+	void *hdr;
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DAEMON);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	mutex_lock(&__ip_vs_mutex);
+	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+					   ip_vs_master_mcast_ifn,
+					   ip_vs_master_syncid, cb) < 0)
+			goto nla_put_failure;
+
+		cb->args[0] = 1;
+	}
+
+	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+					   ip_vs_backup_mcast_ifn,
+					   ip_vs_backup_syncid, cb) < 0)
+			goto nla_put_failure;
+
+		cb->args[1] = 1;
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+{
+	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
+	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
+	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
+		return -EINVAL;
+
+	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+}
+
+static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+{
+	if (!attrs[IPVS_DAEMON_ATTR_STATE])
+		return -EINVAL;
+
+	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+}
+
+static int ip_vs_genl_set_config(struct nlattr **attrs)
+{
+	struct ip_vs_timeout_user t;
+
+	__ip_vs_get_timeouts(&t);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
+		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
+		t.tcp_fin_timeout =
+			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
+		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
+
+	return ip_vs_set_timeout(&t);
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ip_vs_service *svc = NULL;
+	struct ip_vs_service_user usvc;
+	struct ip_vs_dest_user udest;
+	int ret = 0, cmd;
+	int need_full_svc = 0, need_full_dest = 0;
+
+	cmd = info->genlhdr->cmd;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (cmd == IPVS_CMD_FLUSH) {
+		ret = ip_vs_flush();
+		goto out;
+	} else if (cmd == IPVS_CMD_SET_CONFIG) {
+		ret = ip_vs_genl_set_config(info->attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
+		   cmd == IPVS_CMD_DEL_DAEMON) {
+
+		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+				     info->attrs[IPVS_CMD_ATTR_DAEMON],
+				     ip_vs_daemon_policy)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (cmd == IPVS_CMD_NEW_DAEMON)
+			ret = ip_vs_genl_new_daemon(daemon_attrs);
+		else
+			ret = ip_vs_genl_del_daemon(daemon_attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_ZERO &&
+		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+		ret = ip_vs_zero_all();
+		goto out;
+	}
+
+	/* All following commands require a service argument, so check if we
+	 * received a valid one. We need a full service specification when
+	 * adding / editing a service. Only identifying members otherwise. */
+	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+		need_full_svc = 1;
+
+	ret = ip_vs_genl_parse_service(&usvc,
+				       info->attrs[IPVS_CMD_ATTR_SERVICE],
+				       need_full_svc);
+	if (ret)
+		goto out;
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port);
+	else
+		svc = __ip_vs_svc_fwm_get(usvc.fwmark);
+
+	/* Unless we're adding a new service, the service must already exist */
+	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* Destination commands require a valid destination argument. For
+	 * adding / editing a destination, we need a full destination
+	 * specification. */
+	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+	    cmd == IPVS_CMD_DEL_DEST) {
+		if (cmd != IPVS_CMD_DEL_DEST)
+			need_full_dest = 1;
+
+		ret = ip_vs_genl_parse_dest(&udest,
+					    info->attrs[IPVS_CMD_ATTR_DEST],
+					    need_full_dest);
+		if (ret)
+			goto out;
+	}
+
+	switch (cmd) {
+	case IPVS_CMD_NEW_SERVICE:
+		if (svc == NULL)
+			ret = ip_vs_add_service(&usvc, &svc);
+		else
+			ret = -EEXIST;
+		break;
+	case IPVS_CMD_SET_SERVICE:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IPVS_CMD_DEL_SERVICE:
+		ret = ip_vs_del_service(svc);
+		break;
+	case IPVS_CMD_NEW_DEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IPVS_CMD_SET_DEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IPVS_CMD_DEL_DEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	case IPVS_CMD_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+out:
+	if (svc)
+		ip_vs_service_put(svc);
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	void *reply;
+	int ret, cmd, reply_cmd;
+
+	cmd = info->genlhdr->cmd;
+
+	if (cmd == IPVS_CMD_GET_SERVICE)
+		reply_cmd = IPVS_CMD_NEW_SERVICE;
+	else if (cmd == IPVS_CMD_GET_INFO)
+		reply_cmd = IPVS_CMD_SET_INFO;
+	else if (cmd == IPVS_CMD_GET_CONFIG)
+		reply_cmd = IPVS_CMD_SET_CONFIG;
+	else {
+		IP_VS_ERR("unknown Generic Netlink command\n");
+		return -EINVAL;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+	if (reply == NULL)
+		goto nla_put_failure;
+
+	switch (cmd) {
+	case IPVS_CMD_GET_SERVICE:
+	{
+		struct ip_vs_service *svc;
+
+		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+		if (IS_ERR(svc)) {
+			ret = PTR_ERR(svc);
+			goto out_err;
+		} else if (svc) {
+			ret = ip_vs_genl_fill_service(msg, svc);
+			ip_vs_service_put(svc);
+			if (ret)
+				goto nla_put_failure;
+		} else {
+			ret = -ESRCH;
+			goto out_err;
+		}
+
+		break;
+	}
+
+	case IPVS_CMD_GET_CONFIG:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(&t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+			    t.tcp_fin_timeout);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+#endif
+
+		break;
+	}
+
+	case IPVS_CMD_GET_INFO:
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+			    IP_VS_CONN_TAB_SIZE);
+		break;
+	}
+
+	genlmsg_end(msg, reply);
+	ret = genlmsg_unicast(msg, info->snd_pid);
+	goto out;
+
+nla_put_failure:
+	IP_VS_ERR("not enough space in Netlink message\n");
+	ret = -EMSGSIZE;
+
+out_err:
+	nlmsg_free(msg);
+out:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+
+static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+	{
+		.cmd	= IPVS_CMD_NEW_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+		.dumpit	= ip_vs_genl_dump_services,
+		.policy	= ip_vs_cmd_policy,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.dumpit	= ip_vs_genl_dump_dests,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_daemons,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_INFO,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_ZERO,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_FLUSH,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+};
+
+static int __init ip_vs_genl_register(void)
+{
+	int ret, i;
+
+	ret = genl_register_family(&ip_vs_genl_family);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
+		ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
+		if (ret)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	genl_unregister_family(&ip_vs_genl_family);
+	return ret;
+}
+
+static void ip_vs_genl_unregister(void)
+{
+	genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
 
 int __init ip_vs_control_init(void)
 {
@@ -2334,6 +3201,13 @@ int __init ip_vs_control_init(void)
 		return ret;
 	}
 
+	ret = ip_vs_genl_register();
+	if (ret) {
+		IP_VS_ERR("cannot register Generic Netlink interface.\n");
+		nf_unregister_sockopt(&ip_vs_sockopts);
+		return ret;
+	}
+
 	proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
 	proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
 
@@ -2368,6 +3242,7 @@ void ip_vs_control_cleanup(void)
 	unregister_sysctl_table(sysctl_header);
 	proc_net_remove(&init_net, "ip_vs_stats");
 	proc_net_remove(&init_net, "ip_vs");
+	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
 	LeaveFunction(2);
 }
-- 
cgit v1.2.3


From 82dfb6f32219d8e6cf6b979a520cb2b11d977d4e Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Mon, 11 Aug 2008 19:36:06 +0000
Subject: ipvs: Only call init_service, update_service and done_service for
 schedulers if defined

There are schedulers that only schedule based on data available in the service
or destination structures and they don't need any persistent storage or
initialization routine. These schedulers currently provide dummy functions for
the init_service, update_service and/or done_service functions. For the
init_service and done_service cases we already have code that only calls these
functions, if the scheduler provides them. Do the same for the update_service
case and remove the dummy functions from all schedulers.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c   | 21 ++++++++++++---------
 net/ipv4/ipvs/ip_vs_lblc.c  |  7 -------
 net/ipv4/ipvs/ip_vs_lblcr.c |  7 -------
 net/ipv4/ipvs/ip_vs_lc.c    | 21 ---------------------
 net/ipv4/ipvs/ip_vs_nq.c    | 24 ------------------------
 net/ipv4/ipvs/ip_vs_rr.c    |  7 -------
 net/ipv4/ipvs/ip_vs_sed.c   | 24 ------------------------
 net/ipv4/ipvs/ip_vs_wlc.c   | 24 ------------------------
 8 files changed, 12 insertions(+), 123 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index d1dbd8b311b..ede101eeec1 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -869,7 +869,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 		svc->num_dests++;
 
 		/* call the update_service function of its scheduler */
-		svc->scheduler->update_service(svc);
+		if (svc->scheduler->update_service)
+			svc->scheduler->update_service(svc);
 
 		write_unlock_bh(&__ip_vs_svc_lock);
 		return 0;
@@ -899,7 +900,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	svc->num_dests++;
 
 	/* call the update_service function of its scheduler */
-	svc->scheduler->update_service(svc);
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
 
@@ -949,7 +951,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
 
 	/* call the update_service, because server weight may be changed */
-	svc->scheduler->update_service(svc);
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
 
@@ -1012,12 +1015,12 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
 	 */
 	list_del(&dest->n_list);
 	svc->num_dests--;
-	if (svcupd) {
-		/*
-		 *  Call the update_service function of its scheduler
-		 */
-		svc->scheduler->update_service(svc);
-	}
+
+	/*
+	 *  Call the update_service function of its scheduler
+	 */
+	if (svcupd && svc->scheduler->update_service)
+			svc->scheduler->update_service(svc);
 }
 
 
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 7a6a319f544..4a14d069f8a 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -388,12 +388,6 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline struct ip_vs_dest *
 __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 {
@@ -542,7 +536,6 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
 	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
 	.init_service =		ip_vs_lblc_init_svc,
 	.done_service =		ip_vs_lblc_done_svc,
-	.update_service =	ip_vs_lblc_update_svc,
 	.schedule =		ip_vs_lblc_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index c234e73968a..46b870385b8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -572,12 +572,6 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline struct ip_vs_dest *
 __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 {
@@ -731,7 +725,6 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
 	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
 	.init_service =		ip_vs_lblcr_init_svc,
 	.done_service =		ip_vs_lblcr_done_svc,
-	.update_service =	ip_vs_lblcr_update_svc,
 	.schedule =		ip_vs_lblcr_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
index ebcdbf75ac6..2c3de1b6351 100644
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -20,24 +20,6 @@
 #include <net/ip_vs.h>
 
 
-static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline unsigned int
 ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
 {
@@ -99,9 +81,6 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = {
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
-	.init_service =		ip_vs_lc_init_svc,
-	.done_service =		ip_vs_lc_done_svc,
-	.update_service =	ip_vs_lc_update_svc,
 	.schedule =		ip_vs_lc_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
index 92f3a677003..5330d5a2de1 100644
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -37,27 +37,6 @@
 #include <net/ip_vs.h>
 
 
-static int
-ip_vs_nq_init_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_nq_done_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_nq_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline unsigned int
 ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
 {
@@ -137,9 +116,6 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
-	.init_service =		ip_vs_nq_init_svc,
-	.done_service =		ip_vs_nq_done_svc,
-	.update_service =	ip_vs_nq_update_svc,
 	.schedule =		ip_vs_nq_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
index 358110d17e5..f7492911753 100644
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -32,12 +32,6 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
 {
 	svc->sched_data = &svc->destinations;
@@ -96,7 +90,6 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
 	.init_service =		ip_vs_rr_init_svc,
-	.done_service =		ip_vs_rr_done_svc,
 	.update_service =	ip_vs_rr_update_svc,
 	.schedule =		ip_vs_rr_schedule,
 };
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
index 77663d84cbd..53f73bea66c 100644
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -41,27 +41,6 @@
 #include <net/ip_vs.h>
 
 
-static int
-ip_vs_sed_init_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_sed_done_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_sed_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline unsigned int
 ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
 {
@@ -139,9 +118,6 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
-	.init_service =		ip_vs_sed_init_svc,
-	.done_service =		ip_vs_sed_done_svc,
-	.update_service =	ip_vs_sed_update_svc,
 	.schedule =		ip_vs_sed_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
index 9b0ef86bb1f..df7ad8d7476 100644
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -25,27 +25,6 @@
 #include <net/ip_vs.h>
 
 
-static int
-ip_vs_wlc_init_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_wlc_done_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_wlc_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline unsigned int
 ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
 {
@@ -127,9 +106,6 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
-	.init_service =		ip_vs_wlc_init_svc,
-	.done_service =		ip_vs_wlc_done_svc,
-	.update_service =	ip_vs_wlc_update_svc,
 	.schedule =		ip_vs_wlc_schedule,
 };
 
-- 
cgit v1.2.3


From a919cf4b6b499416b6e2247dbc79196c4325f2e6 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Thu, 14 Aug 2008 00:47:16 +0200
Subject: ipvs: Create init functions for estimator code

Commit 8ab19ea36c5c5340ff598e4d15fc084eb65671dc ("ipvs: Fix possible deadlock
in estimator code") fixed a deadlock condition, but that condition can only
happen during unload of IPVS, because during normal operation there is at least
our global stats structure in the estimator list. The mod_timer() and
del_timer_sync() calls are actually initialization and cleanup code in
disguise. Let's make it explicit and move them to their own init and cleanup
function.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c |  8 ++++++--
 net/ipv4/ipvs/ip_vs_est.c  | 18 +++++++++++-------
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index a7879eafc3b..9fbf0a6d739 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1070,10 +1070,12 @@ static int __init ip_vs_init(void)
 {
 	int ret;
 
+	ip_vs_estimator_init();
+
 	ret = ip_vs_control_init();
 	if (ret < 0) {
 		IP_VS_ERR("can't setup control.\n");
-		goto cleanup_nothing;
+		goto cleanup_estimator;
 	}
 
 	ip_vs_protocol_init();
@@ -1106,7 +1108,8 @@ static int __init ip_vs_init(void)
   cleanup_protocol:
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
-  cleanup_nothing:
+  cleanup_estimator:
+	ip_vs_estimator_cleanup();
 	return ret;
 }
 
@@ -1117,6 +1120,7 @@ static void __exit ip_vs_cleanup(void)
 	ip_vs_app_cleanup();
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
+	ip_vs_estimator_cleanup();
 	IP_VS_INFO("ipvs unloaded.\n");
 }
 
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 5a20f93bd7f..4fb620ec208 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -124,8 +124,6 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
 	est->outbps = stats->outbps<<5;
 
 	spin_lock_bh(&est_lock);
-	if (list_empty(&est_list))
-		mod_timer(&est_timer, jiffies + 2 * HZ);
 	list_add(&est->list, &est_list);
 	spin_unlock_bh(&est_lock);
 }
@@ -136,11 +134,6 @@ void ip_vs_kill_estimator(struct ip_vs_stats *stats)
 
 	spin_lock_bh(&est_lock);
 	list_del(&est->list);
-	while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) {
-		spin_unlock_bh(&est_lock);
-		cpu_relax();
-		spin_lock_bh(&est_lock);
-	}
 	spin_unlock_bh(&est_lock);
 }
 
@@ -160,3 +153,14 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
 	est->inbps = 0;
 	est->outbps = 0;
 }
+
+int __init ip_vs_estimator_init(void)
+{
+	mod_timer(&est_timer, jiffies + 2 * HZ);
+	return 0;
+}
+
+void ip_vs_estimator_cleanup(void)
+{
+	del_timer_sync(&est_timer);
+}
-- 
cgit v1.2.3


From 4a031b0e6acd8a8c23725ceb5db6a0aa5c4e231f Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Fri, 15 Aug 2008 09:26:15 +1000
Subject: ipvs: rename __ip_vs_wlc_schedule in lblc and lblcr schedulers

For the sake of clarity, rename __ip_vs_wlc_schedule() in lblc.c to
__ip_vs_lblc_schedule() and the version in lblcr.c to __ip_vs_lblc_schedule().

I guess the original name stuck from a copy and paste.

Cc: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_lblc.c  | 6 +++---
 net/ipv4/ipvs/ip_vs_lblcr.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 4a14d069f8a..b9b334cccf3 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -389,7 +389,7 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 
 
 static inline struct ip_vs_dest *
-__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 {
 	struct ip_vs_dest *dest, *least;
 	int loh, doh;
@@ -488,7 +488,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	tbl = (struct ip_vs_lblc_table *)svc->sched_data;
 	en = ip_vs_lblc_get(tbl, iph->daddr);
 	if (en == NULL) {
-		dest = __ip_vs_wlc_schedule(svc, iph);
+		dest = __ip_vs_lblc_schedule(svc, iph);
 		if (dest == NULL) {
 			IP_VS_DBG(1, "no destination available\n");
 			return NULL;
@@ -503,7 +503,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
 		    || atomic_read(&dest->weight) <= 0
 		    || is_overloaded(dest, svc)) {
-			dest = __ip_vs_wlc_schedule(svc, iph);
+			dest = __ip_vs_lblc_schedule(svc, iph);
 			if (dest == NULL) {
 				IP_VS_DBG(1, "no destination available\n");
 				return NULL;
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 46b870385b8..f1c84503689 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -573,7 +573,7 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 
 
 static inline struct ip_vs_dest *
-__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 {
 	struct ip_vs_dest *dest, *least;
 	int loh, doh;
@@ -673,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
 	en = ip_vs_lblcr_get(tbl, iph->daddr);
 	if (en == NULL) {
-		dest = __ip_vs_wlc_schedule(svc, iph);
+		dest = __ip_vs_lblcr_schedule(svc, iph);
 		if (dest == NULL) {
 			IP_VS_DBG(1, "no destination available\n");
 			return NULL;
@@ -687,7 +687,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	} else {
 		dest = ip_vs_dest_set_min(&en->set);
 		if (!dest || is_overloaded(dest, svc)) {
-			dest = __ip_vs_wlc_schedule(svc, iph);
+			dest = __ip_vs_lblcr_schedule(svc, iph);
 			if (dest == NULL) {
 				IP_VS_DBG(1, "no destination available\n");
 				return NULL;
-- 
cgit v1.2.3


From 39ac50d0c79747b186c1268d9a488f8c1d256be7 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Mon, 18 Aug 2008 00:52:08 +0200
Subject: ipvs: Fix race conditions in lblc scheduler

We can't access the cache entry outside of our critical read-locked region,
because someone may free that entry. And we also need to check in the critical
region wether the destination is still available, i.e. it's not in the trash.
If we drop our reference counter, the destination can be purged from the trash
at any time. Our caller only guarantees that no destination is moved to the
trash, while we are scheduling. Also there is no need for our own rwlock,
there is already one in the service structure for use in the schedulers.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_lblc.c | 204 +++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 108 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index b9b334cccf3..d2a43aa3fe4 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry {
  *      IPVS lblc hash table
  */
 struct ip_vs_lblc_table {
-	rwlock_t	        lock;           /* lock for this table */
 	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
 	atomic_t                entries;        /* number of entries */
 	int                     max_size;       /* maximum size of entries */
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = {
 
 static struct ctl_table_header * sysctl_header;
 
-/*
- *      new/free a ip_vs_lblc_entry, which is a mapping of a destionation
- *      IP address to a server.
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
-{
-	struct ip_vs_lblc_entry *en;
-
-	en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
-	if (en == NULL) {
-		IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
-		return NULL;
-	}
-
-	INIT_LIST_HEAD(&en->list);
-	en->addr = daddr;
-
-	atomic_inc(&dest->refcnt);
-	en->dest = dest;
-
-	return en;
-}
-
-
 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
 {
 	list_del(&en->list);
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
  *	Hash an entry in the ip_vs_lblc_table.
  *	returns bool success.
  */
-static int
+static void
 ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
 {
-	unsigned hash;
-
-	if (!list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
+	unsigned hash = ip_vs_lblc_hashkey(en->addr);
 
-	/*
-	 *	Hash by destination IP address
-	 */
-	hash = ip_vs_lblc_hashkey(en->addr);
-
-	write_lock(&tbl->lock);
 	list_add(&en->list, &tbl->bucket[hash]);
 	atomic_inc(&tbl->entries);
-	write_unlock(&tbl->lock);
-
-	return 1;
 }
 
 
 /*
- *  Get ip_vs_lblc_entry associated with supplied parameters.
+ *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read
+ *  lock
  */
 static inline struct ip_vs_lblc_entry *
 ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
 {
-	unsigned hash;
+	unsigned hash = ip_vs_lblc_hashkey(addr);
 	struct ip_vs_lblc_entry *en;
 
-	hash = ip_vs_lblc_hashkey(addr);
+	list_for_each_entry(en, &tbl->bucket[hash], list)
+		if (en->addr == addr)
+			return en;
 
-	read_lock(&tbl->lock);
+	return NULL;
+}
 
-	list_for_each_entry(en, &tbl->bucket[hash], list) {
-		if (en->addr == addr) {
-			/* HIT */
-			read_unlock(&tbl->lock);
-			return en;
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblc_entry *en;
+
+	en = ip_vs_lblc_get(tbl, daddr);
+	if (!en) {
+		en = kmalloc(sizeof(*en), GFP_ATOMIC);
+		if (!en) {
+			IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+			return NULL;
 		}
-	}
 
-	read_unlock(&tbl->lock);
+		en->addr = daddr;
+		en->lastuse = jiffies;
 
-	return NULL;
+		atomic_inc(&dest->refcnt);
+		en->dest = dest;
+
+		ip_vs_lblc_hash(tbl, en);
+	} else if (en->dest != dest) {
+		atomic_dec(&en->dest->refcnt);
+		atomic_inc(&dest->refcnt);
+		en->dest = dest;
+	}
+
+	return en;
 }
 
 
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
  */
 static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
 {
-	int i;
 	struct ip_vs_lblc_entry *en, *nxt;
+	int i;
 
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		write_lock(&tbl->lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
 			ip_vs_lblc_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&tbl->lock);
 	}
 }
 
 
-static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 {
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct ip_vs_lblc_entry *en, *nxt;
 	unsigned long now = jiffies;
 	int i, j;
-	struct ip_vs_lblc_entry *en, *nxt;
 
 	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
-		write_lock(&tbl->lock);
+		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
 			if (time_before(now,
 					en->lastuse + sysctl_ip_vs_lblc_expiration))
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
 			ip_vs_lblc_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&tbl->lock);
+		write_unlock(&svc->sched_lock);
 	}
 	tbl->rover = j;
 }
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
  */
 static void ip_vs_lblc_check_expire(unsigned long data)
 {
-	struct ip_vs_lblc_table *tbl;
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
 	unsigned long now = jiffies;
 	int goal;
 	int i, j;
 	struct ip_vs_lblc_entry *en, *nxt;
 
-	tbl = (struct ip_vs_lblc_table *)data;
-
 	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
 		/* do full expiration check */
-		ip_vs_lblc_full_check(tbl);
+		ip_vs_lblc_full_check(svc);
 		tbl->counter = 1;
 		goto out;
 	}
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
-		write_lock(&tbl->lock);
+		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
 				continue;
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 			atomic_dec(&tbl->entries);
 			goal--;
 		}
-		write_unlock(&tbl->lock);
+		write_unlock(&svc->sched_lock);
 		if (goal <= 0)
 			break;
 	}
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	/*
 	 *    Allocate the ip_vs_lblc_table for this service
 	 */
-	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
 	if (tbl == NULL) {
 		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
 		return -ENOMEM;
 	}
 	svc->sched_data = tbl;
 	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
-		  "current service\n",
-		  sizeof(struct ip_vs_lblc_table));
+		  "current service\n", sizeof(*tbl));
 
 	/*
 	 *    Initialize the hash buckets
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		INIT_LIST_HEAD(&tbl->bucket[i]);
 	}
-	rwlock_init(&tbl->lock);
 	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	 *    Hook periodic timer for garbage collection
 	 */
 	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
-			(unsigned long)tbl);
-	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
-	add_timer(&tbl->periodic_timer);
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
 
 	return 0;
 }
@@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 	ip_vs_lblc_flush(tbl);
 
 	/* release the table itself */
-	kfree(svc->sched_data);
+	kfree(tbl);
 	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
-		  sizeof(struct ip_vs_lblc_table));
+		  sizeof(*tbl));
 
 	return 0;
 }
@@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 static struct ip_vs_dest *
 ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
-	struct ip_vs_dest *dest;
-	struct ip_vs_lblc_table *tbl;
-	struct ip_vs_lblc_entry *en;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
 	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblc_entry *en;
 
 	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
 
-	tbl = (struct ip_vs_lblc_table *)svc->sched_data;
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
 	en = ip_vs_lblc_get(tbl, iph->daddr);
-	if (en == NULL) {
-		dest = __ip_vs_lblc_schedule(svc, iph);
-		if (dest == NULL) {
-			IP_VS_DBG(1, "no destination available\n");
-			return NULL;
-		}
-		en = ip_vs_lblc_new(iph->daddr, dest);
-		if (en == NULL) {
-			return NULL;
-		}
-		ip_vs_lblc_hash(tbl, en);
-	} else {
-		dest = en->dest;
-		if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
-		    || atomic_read(&dest->weight) <= 0
-		    || is_overloaded(dest, svc)) {
-			dest = __ip_vs_lblc_schedule(svc, iph);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "no destination available\n");
-				return NULL;
-			}
-			atomic_dec(&en->dest->refcnt);
-			atomic_inc(&dest->refcnt);
-			en->dest = dest;
-		}
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/*
+		 * If the destination is not available, i.e. it's in the trash,
+		 * we must ignore it, as it may be removed from under our feet,
+		 * if someone drops our reference count. Our caller only makes
+		 * sure that destinations, that are not in the trash, are not
+		 * moved to the trash, while we are scheduling. But anyone can
+		 * free up entries from the trash at any time.
+		 */
+
+		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
+			dest = en->dest;
+	}
+	read_unlock(&svc->sched_lock);
+
+	/* If the destination has a weight and is not overloaded, use it */
+	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+		goto out;
+
+	/* No cache entry or it is invalid, time to schedule */
+	dest = __ip_vs_lblc_schedule(svc, iph);
+	if (!dest) {
+		IP_VS_DBG(1, "no destination available\n");
+		return NULL;
 	}
-	en->lastuse = jiffies;
 
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblc_new(tbl, iph->daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
 	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(en->addr),
+		  NIPQUAD(iph->daddr),
 		  NIPQUAD(dest->addr),
 		  ntohs(dest->port));
 
-- 
cgit v1.2.3


From f728bafb5698076dd35bca35ee6cfe52ea1b8ab2 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Tue, 19 Aug 2008 08:16:19 +0200
Subject: ipvs: Fix race conditions in lblcr scheduler

We can't access the cache entry outside of our critical read-locked region,
because someone may free that entry. Also getting an entry under read lock,
then locking for write and trying to delete that entry looks fishy, but should
be no problem here, because we're only comparing a pointer. Also there is no
need for our own rwlock, there is already one in the service structure for use
in the schedulers.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_lblcr.c | 229 ++++++++++++++++++++++----------------------
 1 file changed, 114 insertions(+), 115 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index f1c84503689..375a1ffb6b6 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 			return NULL;
 	}
 
-	e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
 	if (e == NULL) {
 		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
 		return NULL;
@@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 	e->dest = dest;
 
 	/* link it to the list */
-	write_lock(&set->lock);
 	e->next = set->list;
 	set->list = e;
 	atomic_inc(&set->size);
-	write_unlock(&set->lock);
 
 	set->lastmod = jiffies;
 	return e;
@@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 {
 	struct ip_vs_dest_list *e, **ep;
 
-	write_lock(&set->lock);
 	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
 		if (e->dest == dest) {
 			/* HIT */
@@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 		}
 		ep = &e->next;
 	}
-	write_unlock(&set->lock);
 }
 
 static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
@@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 	if (set == NULL)
 		return NULL;
 
-	read_lock(&set->lock);
 	/* select the first destination server, whose weight > 0 */
 	for (e=set->list; e!=NULL; e=e->next) {
 		least = e->dest;
@@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 			goto nextstage;
 		}
 	}
-	read_unlock(&set->lock);
 	return NULL;
 
 	/* find the destination with the weighted least load */
@@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 			loh = doh;
 		}
 	}
-	read_unlock(&set->lock);
 
 	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
@@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 	if (set == NULL)
 		return NULL;
 
-	read_lock(&set->lock);
 	/* select the first destination server, whose weight > 0 */
 	for (e=set->list; e!=NULL; e=e->next) {
 		most = e->dest;
@@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 			goto nextstage;
 		}
 	}
-	read_unlock(&set->lock);
 	return NULL;
 
 	/* find the destination with the weighted most load */
@@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 			moh = doh;
 		}
 	}
-	read_unlock(&set->lock);
 
 	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
@@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
  *      IPVS lblcr hash table
  */
 struct ip_vs_lblcr_table {
-	rwlock_t	        lock;           /* lock for this table */
 	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
 	atomic_t                entries;        /* number of entries */
 	int                     max_size;       /* maximum size of entries */
@@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
 
 static struct ctl_table_header * sysctl_header;
 
-/*
- *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
- *      IP address to a server.
- */
-static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
-{
-	struct ip_vs_lblcr_entry *en;
-
-	en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
-	if (en == NULL) {
-		IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
-		return NULL;
-	}
-
-	INIT_LIST_HEAD(&en->list);
-	en->addr = daddr;
-
-	/* initilize its dest set */
-	atomic_set(&(en->set.size), 0);
-	en->set.list = NULL;
-	rwlock_init(&en->set.lock);
-
-	return en;
-}
-
-
 static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
 {
 	list_del(&en->list);
@@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
  *	Hash an entry in the ip_vs_lblcr_table.
  *	returns bool success.
  */
-static int
+static void
 ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
 {
-	unsigned hash;
-
-	if (!list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
+	unsigned hash = ip_vs_lblcr_hashkey(en->addr);
 
-	/*
-	 *	Hash by destination IP address
-	 */
-	hash = ip_vs_lblcr_hashkey(en->addr);
-
-	write_lock(&tbl->lock);
 	list_add(&en->list, &tbl->bucket[hash]);
 	atomic_inc(&tbl->entries);
-	write_unlock(&tbl->lock);
-
-	return 1;
 }
 
 
 /*
- *  Get ip_vs_lblcr_entry associated with supplied parameters.
+ *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
+ *  read lock.
  */
 static inline struct ip_vs_lblcr_entry *
 ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
 {
-	unsigned hash;
+	unsigned hash = ip_vs_lblcr_hashkey(addr);
 	struct ip_vs_lblcr_entry *en;
 
-	hash = ip_vs_lblcr_hashkey(addr);
+	list_for_each_entry(en, &tbl->bucket[hash], list)
+		if (en->addr == addr)
+			return en;
+
+	return NULL;
+}
 
-	read_lock(&tbl->lock);
 
-	list_for_each_entry(en, &tbl->bucket[hash], list) {
-		if (en->addr == addr) {
-			/* HIT */
-			read_unlock(&tbl->lock);
-			return en;
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
+		struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblcr_entry *en;
+
+	en = ip_vs_lblcr_get(tbl, daddr);
+	if (!en) {
+		en = kmalloc(sizeof(*en), GFP_ATOMIC);
+		if (!en) {
+			IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+			return NULL;
 		}
+
+		en->addr = daddr;
+		en->lastuse = jiffies;
+
+		/* initilize its dest set */
+		atomic_set(&(en->set.size), 0);
+		en->set.list = NULL;
+		rwlock_init(&en->set.lock);
+
+		ip_vs_lblcr_hash(tbl, en);
 	}
 
-	read_unlock(&tbl->lock);
+	write_lock(&en->set.lock);
+	ip_vs_dest_set_insert(&en->set, dest);
+	write_unlock(&en->set.lock);
 
-	return NULL;
+	return en;
 }
 
 
@@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
 	int i;
 	struct ip_vs_lblcr_entry *en, *nxt;
 
+	/* No locking required, only called during cleanup. */
 	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		write_lock(&tbl->lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
 			ip_vs_lblcr_free(en);
-			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&tbl->lock);
 	}
 }
 
 
-static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
 {
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 	unsigned long now = jiffies;
 	int i, j;
 	struct ip_vs_lblcr_entry *en, *nxt;
@@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
-		write_lock(&tbl->lock);
+		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
 			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
 				       now))
@@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
 			ip_vs_lblcr_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&tbl->lock);
+		write_unlock(&svc->sched_lock);
 	}
 	tbl->rover = j;
 }
@@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
  */
 static void ip_vs_lblcr_check_expire(unsigned long data)
 {
-	struct ip_vs_lblcr_table *tbl;
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 	unsigned long now = jiffies;
 	int goal;
 	int i, j;
 	struct ip_vs_lblcr_entry *en, *nxt;
 
-	tbl = (struct ip_vs_lblcr_table *)data;
-
 	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
 		/* do full expiration check */
-		ip_vs_lblcr_full_check(tbl);
+		ip_vs_lblcr_full_check(svc);
 		tbl->counter = 1;
 		goto out;
 	}
@@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
-		write_lock(&tbl->lock);
+		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
 				continue;
@@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 			atomic_dec(&tbl->entries);
 			goal--;
 		}
-		write_unlock(&tbl->lock);
+		write_unlock(&svc->sched_lock);
 		if (goal <= 0)
 			break;
 	}
@@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	/*
 	 *    Allocate the ip_vs_lblcr_table for this service
 	 */
-	tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
 	if (tbl == NULL) {
 		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
 		return -ENOMEM;
 	}
 	svc->sched_data = tbl;
 	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
-		  "current service\n",
-		  sizeof(struct ip_vs_lblcr_table));
+		  "current service\n", sizeof(*tbl));
 
 	/*
 	 *    Initialize the hash buckets
@@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		INIT_LIST_HEAD(&tbl->bucket[i]);
 	}
-	rwlock_init(&tbl->lock);
 	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
@@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	 *    Hook periodic timer for garbage collection
 	 */
 	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
-			(unsigned long)tbl);
-	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
-	add_timer(&tbl->periodic_timer);
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
 
 	return 0;
 }
@@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 	ip_vs_lblcr_flush(tbl);
 
 	/* release the table itself */
-	kfree(svc->sched_data);
+	kfree(tbl);
 	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
-		  sizeof(struct ip_vs_lblcr_table));
+		  sizeof(*tbl));
 
 	return 0;
 }
@@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 static struct ip_vs_dest *
 ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
-	struct ip_vs_dest *dest;
-	struct ip_vs_lblcr_table *tbl;
-	struct ip_vs_lblcr_entry *en;
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblcr_entry *en;
 
 	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
 
-	tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
 	en = ip_vs_lblcr_get(tbl, iph->daddr);
-	if (en == NULL) {
-		dest = __ip_vs_lblcr_schedule(svc, iph);
-		if (dest == NULL) {
-			IP_VS_DBG(1, "no destination available\n");
-			return NULL;
-		}
-		en = ip_vs_lblcr_new(iph->daddr);
-		if (en == NULL) {
-			return NULL;
-		}
-		ip_vs_dest_set_insert(&en->set, dest);
-		ip_vs_lblcr_hash(tbl, en);
-	} else {
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/* Get the least loaded destination */
+		read_lock(&en->set.lock);
 		dest = ip_vs_dest_set_min(&en->set);
-		if (!dest || is_overloaded(dest, svc)) {
-			dest = __ip_vs_lblcr_schedule(svc, iph);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "no destination available\n");
-				return NULL;
-			}
-			ip_vs_dest_set_insert(&en->set, dest);
-		}
+		read_unlock(&en->set.lock);
+
+		/* More than one destination + enough time passed by, cleanup */
 		if (atomic_read(&en->set.size) > 1 &&
-		    jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
+				time_after(jiffies, en->set.lastmod +
+				sysctl_ip_vs_lblcr_expiration)) {
 			struct ip_vs_dest *m;
+
+			write_lock(&en->set.lock);
 			m = ip_vs_dest_set_max(&en->set);
 			if (m)
 				ip_vs_dest_set_erase(&en->set, m);
+			write_unlock(&en->set.lock);
+		}
+
+		/* If the destination is not overloaded, use it */
+		if (dest && !is_overloaded(dest, svc)) {
+			read_unlock(&svc->sched_lock);
+			goto out;
+		}
+
+		/* The cache entry is invalid, time to schedule */
+		dest = __ip_vs_lblcr_schedule(svc, iph);
+		if (!dest) {
+			IP_VS_DBG(1, "no destination available\n");
+			read_unlock(&svc->sched_lock);
+			return NULL;
 		}
+
+		/* Update our cache entry */
+		write_lock(&en->set.lock);
+		ip_vs_dest_set_insert(&en->set, dest);
+		write_unlock(&en->set.lock);
+	}
+	read_unlock(&svc->sched_lock);
+
+	if (dest)
+		goto out;
+
+	/* No cache entry, time to schedule */
+	dest = __ip_vs_lblcr_schedule(svc, iph);
+	if (!dest) {
+		IP_VS_DBG(1, "no destination available\n");
+		return NULL;
 	}
-	en->lastuse = jiffies;
 
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblcr_new(tbl, iph->daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
 	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(en->addr),
+		  NIPQUAD(iph->daddr),
 		  NIPQUAD(dest->addr),
 		  ntohs(dest->port));
 
-- 
cgit v1.2.3


From d92a8e81e097968d8f2bac0581a0a43bff14b8f0 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 16 Jul 2008 16:34:54 +0200
Subject: net/ieee80211: adjust error handling

Converts a test in error handling code to a sequence of labels.

The semantic match that found the problem is:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@@
expression E,E1,E2;
@@

E = alloc_etherdev(...)
... when != E = E1
if (...) { ... free_netdev(E); ... return ...; }
... when != E = E2
(
  if (...)
   {
   ... when != free_netdev(E);
   return dev; }
|
* if (...)
   {
   ... when != free_netdev(E);
   return ...; }
|
register_netdev(E)
)

// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/ieee80211/ieee80211_module.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
index 3bca97f55d4..949772a5a7d 100644
--- a/net/ieee80211/ieee80211_module.c
+++ b/net/ieee80211/ieee80211_module.c
@@ -157,7 +157,7 @@ struct net_device *alloc_ieee80211(int sizeof_priv)
 	err = ieee80211_networks_allocate(ieee);
 	if (err) {
 		IEEE80211_ERROR("Unable to allocate beacon storage: %d\n", err);
-		goto failed;
+		goto failed_free_netdev;
 	}
 	ieee80211_networks_initialize(ieee);
 
@@ -193,9 +193,9 @@ struct net_device *alloc_ieee80211(int sizeof_priv)
 
 	return dev;
 
-      failed:
-	if (dev)
-		free_netdev(dev);
+failed_free_netdev:
+	free_netdev(dev);
+failed:
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 5442060c08a49bd0b416f033e0ae43ccedef5278 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Thu, 24 Jul 2008 12:20:09 -0400
Subject: WIRELESS: Make wireless one-click selectable.

Use "menuconfig" to make wireless support one-click selectable.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/Kconfig | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/Kconfig b/net/Kconfig
index 7612cc8c337..d87de48ba65 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -236,14 +236,18 @@ source "net/rxrpc/Kconfig"
 config FIB_RULES
 	bool
 
-menu "Wireless"
+menuconfig WIRELESS
+	bool "Wireless"
 	depends on !S390
+	default y
+
+if WIRELESS
 
 source "net/wireless/Kconfig"
 source "net/mac80211/Kconfig"
 source "net/ieee80211/Kconfig"
 
-endmenu
+endif # WIRELESS
 
 source "net/rfkill/Kconfig"
 source "net/9p/Kconfig"
-- 
cgit v1.2.3


From 92ab85354993ac3a364c65cab45745af470ffc67 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 24 Jul 2008 21:02:04 +0300
Subject: mac80211: add ieee80211_queue_stopped)

This patch adds ieee80211_queue_stopped that let drivers to query
queue status

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/util.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 0d463c80c40..24400618190 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -386,6 +386,13 @@ void ieee80211_stop_queues(struct ieee80211_hw *hw)
 }
 EXPORT_SYMBOL(ieee80211_stop_queues);
 
+int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	return __netif_subqueue_stopped(local->mdev, queue);
+}
+EXPORT_SYMBOL(ieee80211_queue_stopped);
+
 void ieee80211_wake_queues(struct ieee80211_hw *hw)
 {
 	int i;
-- 
cgit v1.2.3


From b4f28bbb9bf0b2c829ecf97ce2173f204fde4f10 Mon Sep 17 00:00:00 2001
From: Bruno Randolf <br1@einfach.org>
Date: Wed, 30 Jul 2008 17:19:55 +0200
Subject: mac80211: add rx status flag for short preamble

and use it for the radiotap header

Signed-off-by: Bruno Randolf <br1@einfach.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6db85450519..ad47a614a20 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -143,6 +143,8 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	/* IEEE80211_RADIOTAP_FLAGS */
 	if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
 		*pos |= IEEE80211_RADIOTAP_F_FCS;
+	if (status->flag & RX_FLAG_SHORTPRE)
+		*pos |= IEEE80211_RADIOTAP_F_SHORTPRE;
 	pos++;
 
 	/* IEEE80211_RADIOTAP_RATE */
-- 
cgit v1.2.3


From 9deb1ae572364a37d054d916c5bae858f91a3f9a Mon Sep 17 00:00:00 2001
From: Bruno Randolf <br1@einfach.org>
Date: Wed, 30 Jul 2008 17:20:06 +0200
Subject: mac80211: radiotap: assume modulation from rates

use the rates ERP flag to derive CCK or OFDM modulation for the radiotap
header.

(it might be more correct to get this information from the hardware itself, but it
seems safe to assume this in most practical cases.)

Signed-off-by: Bruno Randolf <br1@einfach.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index ad47a614a20..60e9ea11115 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -157,8 +157,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	if (status->band == IEEE80211_BAND_5GHZ)
 		*(__le16 *)pos = cpu_to_le16(IEEE80211_CHAN_OFDM |
 					     IEEE80211_CHAN_5GHZ);
+	else if (rate->flags & IEEE80211_RATE_ERP_G)
+		*(__le16 *)pos = cpu_to_le16(IEEE80211_CHAN_OFDM |
+					     IEEE80211_CHAN_2GHZ);
 	else
-		*(__le16 *)pos = cpu_to_le16(IEEE80211_CHAN_DYN |
+		*(__le16 *)pos = cpu_to_le16(IEEE80211_CHAN_CCK |
 					     IEEE80211_CHAN_2GHZ);
 	pos += 2;
 
-- 
cgit v1.2.3


From 62bf1d762e24006fa9b6c8d56a22cf47a2310af3 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:05 -0700
Subject: mac80211: explicitly check skb->len

ieee80211_get_hdrlen_from_skb internally checks the skb is long enough to
hold the full ieee80211_hdr, else it returns zero.  Use ieee80211_hdrlen
which always returns the hdrlen and check the remaining room in the
skb explicitly when removing encryption headers or the qos control field.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index aa5a191598c..f5537f90dd3 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1244,9 +1244,10 @@ static void ieee80211_remove_tx_extra(struct ieee80211_local *local,
 				      struct ieee80211_key *key,
 				      struct sk_buff *skb)
 {
-	int hdrlen, iv_len, mic_len;
+	unsigned int hdrlen, iv_len, mic_len;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 
-	hdrlen = ieee80211_get_hdrlen_from_skb(skb);
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
 
 	if (!key)
 		goto no_key;
@@ -1268,24 +1269,20 @@ static void ieee80211_remove_tx_extra(struct ieee80211_local *local,
 		goto no_key;
 	}
 
-	if (skb->len >= mic_len &&
+	if (skb->len >= hdrlen + mic_len &&
 	    !(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
 		skb_trim(skb, skb->len - mic_len);
-	if (skb->len >= iv_len && skb->len > hdrlen) {
+	if (skb->len >= hdrlen + iv_len) {
 		memmove(skb->data + iv_len, skb->data, hdrlen);
-		skb_pull(skb, iv_len);
+		hdr = (struct ieee80211_hdr *)skb_pull(skb, iv_len);
 	}
 
 no_key:
-	{
-		struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-		u16 fc = le16_to_cpu(hdr->frame_control);
-		if ((fc & 0x8C) == 0x88) /* QoS Control Field */ {
-			fc &= ~IEEE80211_STYPE_QOS_DATA;
-			hdr->frame_control = cpu_to_le16(fc);
-			memmove(skb->data + 2, skb->data, hdrlen - 2);
-			skb_pull(skb, 2);
-		}
+	if (ieee80211_is_data_qos(hdr->frame_control)) {
+		hdr->frame_control &= ~cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+		memmove(skb->data + IEEE80211_QOS_CTL_LEN, skb->data,
+			hdrlen - IEEE80211_QOS_CTL_LEN);
+		skb_pull(skb, IEEE80211_QOS_CTL_LEN);
 	}
 }
 
-- 
cgit v1.2.3


From c44d040e186e1af9d947f7bad886b7c1d35a22f6 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:07 -0700
Subject: mac80211: wme.h remove unused QOS_CONTROL_LEN

linux/ieee80211.h now has IEEE80211_QOS_CTL_LEN for this purpose.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wme.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index 04de28c071a..465e274df7c 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -14,8 +14,6 @@
 #include <linux/netdevice.h>
 #include "ieee80211_i.h"
 
-#define QOS_CONTROL_LEN 2
-
 #define QOS_CONTROL_ACK_POLICY_NORMAL 0
 #define QOS_CONTROL_ACK_POLICY_NOACK 1
 
-- 
cgit v1.2.3


From d298487260d01934a8df3a4a2a09513d84a8e69b Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:10 -0700
Subject: mac80211: wep.c replace magic numbers in IV/ICV removal

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wep.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 5c2bf0a3d4d..639998775b4 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -228,11 +228,10 @@ int ieee80211_wep_decrypt(struct ieee80211_local *local, struct sk_buff *skb,
 		return -1;
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
-
-	if (skb->len < 8 + hdrlen)
+	if (skb->len < hdrlen + WEP_IV_LEN + WEP_ICV_LEN)
 		return -1;
 
-	len = skb->len - hdrlen - 8;
+	len = skb->len - hdrlen - WEP_IV_LEN - WEP_ICV_LEN;
 
 	keyidx = skb->data[hdrlen + 3] >> 6;
 
@@ -303,7 +302,7 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
 	} else if (!(rx->status->flag & RX_FLAG_IV_STRIPPED)) {
 		ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
 		/* remove ICV */
-		skb_trim(rx->skb, rx->skb->len - 4);
+		skb_trim(rx->skb, rx->skb->len - WEP_ICV_LEN);
 	}
 
 	return RX_CONTINUE;
-- 
cgit v1.2.3


From b73d70ad8665fd3f35c855075b9a94de3e2b69e2 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:12 -0700
Subject: mac80211: rx.c/tx.c remove more users of tx/rx_data->fc

Those functions that still use ieee80211_get_hdrlen are moved over
to use the little endian frame control.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 62 +++++++++++++++++++++----------------------------------
 net/mac80211/tx.c |  2 +-
 2 files changed, 25 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 60e9ea11115..4e9631c140a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -821,7 +821,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
 
 static inline struct ieee80211_fragment_entry *
 ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
-			  u16 fc, unsigned int frag, unsigned int seq,
+			  unsigned int frag, unsigned int seq,
 			  int rx_queue, struct ieee80211_hdr *hdr)
 {
 	struct ieee80211_fragment_entry *entry;
@@ -830,7 +830,6 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
 	idx = sdata->fragment_next;
 	for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) {
 		struct ieee80211_hdr *f_hdr;
-		u16 f_fc;
 
 		idx--;
 		if (idx < 0)
@@ -842,10 +841,13 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
 		    entry->last_frag + 1 != frag)
 			continue;
 
-		f_hdr = (struct ieee80211_hdr *) entry->skb_list.next->data;
-		f_fc = le16_to_cpu(f_hdr->frame_control);
+		f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data;
 
-		if ((fc & IEEE80211_FCTL_FTYPE) != (f_fc & IEEE80211_FCTL_FTYPE) ||
+		/*
+		 * Check ftype and addresses are equal, else check next fragment
+		 */
+		if (((hdr->frame_control ^ f_hdr->frame_control) &
+		     cpu_to_le16(IEEE80211_FCTL_FTYPE)) ||
 		    compare_ether_addr(hdr->addr1, f_hdr->addr1) != 0 ||
 		    compare_ether_addr(hdr->addr2, f_hdr->addr2) != 0)
 			continue;
@@ -870,11 +872,11 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	struct sk_buff *skb;
 	DECLARE_MAC_BUF(mac);
 
-	hdr = (struct ieee80211_hdr *) rx->skb->data;
+	hdr = (struct ieee80211_hdr *)rx->skb->data;
 	sc = le16_to_cpu(hdr->seq_ctrl);
 	frag = sc & IEEE80211_SCTL_FRAG;
 
-	if (likely((!(rx->fc & IEEE80211_FCTL_MOREFRAGS) && frag == 0) ||
+	if (likely((!ieee80211_has_morefrags(hdr->frame_control) && frag == 0) ||
 		   (rx->skb)->len < 24 ||
 		   is_multicast_ether_addr(hdr->addr1))) {
 		/* not fragmented */
@@ -889,7 +891,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
 						 rx->queue, &(rx->skb));
 		if (rx->key && rx->key->conf.alg == ALG_CCMP &&
-		    (rx->fc & IEEE80211_FCTL_PROTECTED)) {
+		    ieee80211_has_protected(hdr->frame_control)) {
 			/* Store CCMP PN so that we can verify that the next
 			 * fragment has a sequential PN value. */
 			entry->ccmp = 1;
@@ -903,8 +905,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	/* This is a fragment for a frame that should already be pending in
 	 * fragment cache. Add this fragment to the end of the pending entry.
 	 */
-	entry = ieee80211_reassemble_find(rx->sdata, rx->fc, frag, seq,
-					  rx->queue, hdr);
+	entry = ieee80211_reassemble_find(rx->sdata, frag, seq, rx->queue, hdr);
 	if (!entry) {
 		I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
 		return RX_DROP_MONITOR;
@@ -929,7 +930,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		memcpy(entry->last_pn, pn, CCMP_PN_LEN);
 	}
 
-	skb_pull(rx->skb, ieee80211_get_hdrlen(rx->fc));
+	skb_pull(rx->skb, ieee80211_hdrlen(hdr->frame_control));
 	__skb_queue_tail(&entry->skb_list, rx->skb);
 	entry->last_frag = frag;
 	entry->extra_len += rx->skb->len;
@@ -1096,7 +1097,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 {
 	struct net_device *dev = rx->dev;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data;
-	u16 fc, hdrlen, ethertype;
+	u16 hdrlen, ethertype;
 	u8 *payload;
 	u8 dst[ETH_ALEN];
 	u8 src[ETH_ALEN] __aligned(2);
@@ -1107,12 +1108,10 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 	DECLARE_MAC_BUF(mac3);
 	DECLARE_MAC_BUF(mac4);
 
-	fc = rx->fc;
-
-	if (unlikely(!WLAN_FC_DATA_PRESENT(fc)))
+	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
 		return -1;
 
-	hdrlen = ieee80211_get_hdrlen(fc);
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
 
 	if (ieee80211_vif_is_mesh(&sdata->vif))
 		hdrlen += ieee80211_get_mesh_hdrlen(
@@ -1127,41 +1126,28 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 	 *   1     0   BSSID SA    DA    n/a
 	 *   1     1   RA    TA    DA    SA
 	 */
+	memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN);
+	memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN);
 
-	switch (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
-	case IEEE80211_FCTL_TODS:
-		/* BSSID SA DA */
-		memcpy(dst, hdr->addr3, ETH_ALEN);
-		memcpy(src, hdr->addr2, ETH_ALEN);
-
+	switch (hdr->frame_control &
+		cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
+	case __constant_cpu_to_le16(IEEE80211_FCTL_TODS):
 		if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_AP &&
 			     sdata->vif.type != IEEE80211_IF_TYPE_VLAN))
 			return -1;
 		break;
-	case (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
-		/* RA TA DA SA */
-		memcpy(dst, hdr->addr3, ETH_ALEN);
-		memcpy(src, hdr->addr4, ETH_ALEN);
-
-		 if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_WDS &&
+	case __constant_cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
+		if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_WDS &&
 			     sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT))
 			return -1;
 		break;
-	case IEEE80211_FCTL_FROMDS:
-		/* DA BSSID SA */
-		memcpy(dst, hdr->addr1, ETH_ALEN);
-		memcpy(src, hdr->addr3, ETH_ALEN);
-
+	case __constant_cpu_to_le16(IEEE80211_FCTL_FROMDS):
 		if (sdata->vif.type != IEEE80211_IF_TYPE_STA ||
 		    (is_multicast_ether_addr(dst) &&
 		     !compare_ether_addr(src, dev->dev_addr)))
 			return -1;
 		break;
-	case 0:
-		/* DA SA BSSID */
-		memcpy(dst, hdr->addr1, ETH_ALEN);
-		memcpy(src, hdr->addr2, ETH_ALEN);
-
+	case __constant_cpu_to_le16(0):
 		if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
 			return -1;
 		break;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 4788f7b91f4..24146f3387a 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1025,7 +1025,7 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 	else if (test_and_clear_sta_flags(tx->sta, WLAN_STA_CLEAR_PS_FILT))
 		info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
 
-	hdrlen = ieee80211_get_hdrlen(tx->fc);
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
 	if (skb->len > hdrlen + sizeof(rfc1042_header) + 2) {
 		u8 *pos = &skb->data[hdrlen + sizeof(rfc1042_header)];
 		tx->ethertype = (pos[0] << 8) | pos[1];
-- 
cgit v1.2.3


From 6b644e524bbd4089a28e0711de4f1cf2daa5db50 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:12 -0700
Subject: mac80211: remove ieee80211_get_hdrlen

All users have been moved over to the version taking a le16 frame control
rather than a cpu-endian value.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/util.c | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 24400618190..f40c060341a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -91,45 +91,6 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
 	return NULL;
 }
 
-int ieee80211_get_hdrlen(u16 fc)
-{
-	int hdrlen = 24;
-
-	switch (fc & IEEE80211_FCTL_FTYPE) {
-	case IEEE80211_FTYPE_DATA:
-		if ((fc & IEEE80211_FCTL_FROMDS) && (fc & IEEE80211_FCTL_TODS))
-			hdrlen = 30; /* Addr4 */
-		/*
-		 * The QoS Control field is two bytes and its presence is
-		 * indicated by the IEEE80211_STYPE_QOS_DATA bit. Add 2 to
-		 * hdrlen if that bit is set.
-		 * This works by masking out the bit and shifting it to
-		 * bit position 1 so the result has the value 0 or 2.
-		 */
-		hdrlen += (fc & IEEE80211_STYPE_QOS_DATA)
-				>> (ilog2(IEEE80211_STYPE_QOS_DATA)-1);
-		break;
-	case IEEE80211_FTYPE_CTL:
-		/*
-		 * ACK and CTS are 10 bytes, all others 16. To see how
-		 * to get this condition consider
-		 *   subtype mask:   0b0000000011110000 (0x00F0)
-		 *   ACK subtype:    0b0000000011010000 (0x00D0)
-		 *   CTS subtype:    0b0000000011000000 (0x00C0)
-		 *   bits that matter:         ^^^      (0x00E0)
-		 *   value of those: 0b0000000011000000 (0x00C0)
-		 */
-		if ((fc & 0xE0) == 0xC0)
-			hdrlen = 10;
-		else
-			hdrlen = 16;
-		break;
-	}
-
-	return hdrlen;
-}
-EXPORT_SYMBOL(ieee80211_get_hdrlen);
-
 unsigned int ieee80211_hdrlen(__le16 fc)
 {
 	unsigned int hdrlen = 24;
-- 
cgit v1.2.3


From e7827a7031a931c74c48e4a53f73ed862f0c8da0 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:13 -0700
Subject: mac80211: remove IEEE80211_FC helper

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  2 --
 net/mac80211/mesh_hwmp.c   |  8 ++++----
 net/mac80211/mesh_plink.c  |  4 ++--
 net/mac80211/mlme.c        | 46 +++++++++++++++++++++++-----------------------
 net/mac80211/tx.c          |  4 ++--
 5 files changed, 31 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index ec59345af65..a2870bc245d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -38,8 +38,6 @@
 
 #define WLAN_FC_DATA_PRESENT(fc) (((fc) & 0x4c) == 0x08)
 
-#define IEEE80211_FC(type, subtype) cpu_to_le16(type | subtype)
-
 struct ieee80211_local;
 
 /* Maximum number of broadcast/multicast frames to buffer when some of the
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 08aca446ca0..2cdbd522631 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -99,8 +99,8 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 	mgmt = (struct ieee80211_mgmt *)
 		skb_put(skb, 25 + sizeof(mgmt->u.action.u.mesh_action));
 	memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.mesh_action));
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_ACTION);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
 
 	memcpy(mgmt->da, da, ETH_ALEN);
 	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
@@ -178,8 +178,8 @@ int mesh_path_error_tx(u8 *dst, __le32 dst_dsn, u8 *ra,
 	mgmt = (struct ieee80211_mgmt *)
 		skb_put(skb, 25 + sizeof(mgmt->u.action.u.mesh_action));
 	memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.mesh_action));
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_ACTION);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
 
 	memcpy(mgmt->da, ra, ETH_ALEN);
 	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 9efeb1f0702..4a7e6d094ff 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -163,8 +163,8 @@ static int mesh_plink_frame_tx(struct net_device *dev,
 	mgmt = (struct ieee80211_mgmt *)
 		skb_put(skb, 25 + sizeof(mgmt->u.action.u.plink_action));
 	memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.plink_action));
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_ACTION);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
 	memcpy(mgmt->da, da, ETH_ALEN);
 	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
 	/* BSSID is left zeroed, wildcard value */
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 1e97fb9fb34..ac776c9d48f 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -641,8 +641,8 @@ static void ieee80211_send_auth(struct net_device *dev,
 
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
 	memset(mgmt, 0, 24 + 6);
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_AUTH);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_AUTH);
 	if (encrypt)
 		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
@@ -771,8 +771,8 @@ static void ieee80211_send_assoc(struct net_device *dev,
 
 	if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
 		skb_put(skb, 10);
-		mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-						   IEEE80211_STYPE_REASSOC_REQ);
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_REASSOC_REQ);
 		mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
 		mgmt->u.reassoc_req.listen_interval =
 				cpu_to_le16(local->hw.conf.listen_interval);
@@ -780,8 +780,8 @@ static void ieee80211_send_assoc(struct net_device *dev,
 		       ETH_ALEN);
 	} else {
 		skb_put(skb, 4);
-		mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-						   IEEE80211_STYPE_ASSOC_REQ);
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_ASSOC_REQ);
 		mgmt->u.assoc_req.capab_info = cpu_to_le16(capab);
 		mgmt->u.reassoc_req.listen_interval =
 				cpu_to_le16(local->hw.conf.listen_interval);
@@ -931,8 +931,8 @@ static void ieee80211_send_deauth(struct net_device *dev,
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
 	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_DEAUTH);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_DEAUTH);
 	skb_put(skb, 2);
 	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
 
@@ -960,8 +960,8 @@ static void ieee80211_send_disassoc(struct net_device *dev,
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
 	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_DISASSOC);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_DISASSOC);
 	skb_put(skb, 2);
 	mgmt->u.disassoc.reason_code = cpu_to_le16(reason);
 
@@ -1115,8 +1115,8 @@ static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst,
 
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_PROBE_REQ);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_PROBE_REQ);
 	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
 	if (dst) {
 		memcpy(mgmt->da, dst, ETH_ALEN);
@@ -1219,8 +1219,8 @@ static void ieee80211_send_addba_resp(struct net_device *dev, u8 *da, u16 tid,
 		memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					   IEEE80211_STYPE_ACTION);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
 
 	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
 	mgmt->u.action.category = WLAN_CATEGORY_BACK;
@@ -1268,8 +1268,8 @@ void ieee80211_send_addba_request(struct net_device *dev, const u8 *da,
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					IEEE80211_STYPE_ACTION);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
 
 	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
 
@@ -1524,8 +1524,8 @@ void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid,
 		memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-					IEEE80211_STYPE_ACTION);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
 
 	skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba));
 
@@ -1556,8 +1556,8 @@ void ieee80211_send_bar(struct net_device *dev, u8 *ra, u16 tid, u16 ssn)
 	skb_reserve(skb, local->hw.extra_tx_headroom);
 	bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar));
 	memset(bar, 0, sizeof(*bar));
-	bar->frame_control = IEEE80211_FC(IEEE80211_FTYPE_CTL,
-					IEEE80211_STYPE_BACK_REQ);
+	bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+					 IEEE80211_STYPE_BACK_REQ);
 	memcpy(bar->ra, ra, ETH_ALEN);
 	memcpy(bar->ta, dev->dev_addr, ETH_ALEN);
 	bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL;
@@ -1801,7 +1801,7 @@ static void ieee80211_send_refuse_measurement_request(struct net_device *dev,
 	memcpy(msr_report->da, da, ETH_ALEN);
 	memcpy(msr_report->sa, dev->dev_addr, ETH_ALEN);
 	memcpy(msr_report->bssid, bssid, ETH_ALEN);
-	msr_report->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
+	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 						IEEE80211_STYPE_ACTION);
 
 	skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
@@ -2446,8 +2446,8 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 		mgmt = (struct ieee80211_mgmt *)
 			skb_put(skb, 24 + sizeof(mgmt->u.beacon));
 		memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
-		mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-						   IEEE80211_STYPE_PROBE_RESP);
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_PROBE_RESP);
 		memset(mgmt->da, 0xff, ETH_ALEN);
 		memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 24146f3387a..9c60dedc368 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1889,8 +1889,8 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 			goto out;
 
 		hdr = (struct ieee80211_hdr *) skb->data;
-		hdr->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT,
-						  IEEE80211_STYPE_BEACON);
+		hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						 IEEE80211_STYPE_BEACON);
 
 		num_beacons = &ifsta->num_beacons;
 	} else if (ieee80211_vif_is_mesh(&sdata->vif)) {
-- 
cgit v1.2.3


From 358c8d9d332230b14e130b78a6930996cdbf84c2 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:13 -0700
Subject: mac80211: use ieee80211 frame control directly

Remove the last users of the rx/tx_data->fc data members and use the
le16 frame_control from the header directly.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c  | 53 ++++++++++++++++++++++++++---------------------------
 net/mac80211/tx.c  | 35 ++++++++++++++---------------------
 net/mac80211/wep.c |  7 ++++---
 3 files changed, 44 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 4e9631c140a..3a96251379f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -867,16 +867,18 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_hdr *hdr;
 	u16 sc;
+	__le16 fc;
 	unsigned int frag, seq;
 	struct ieee80211_fragment_entry *entry;
 	struct sk_buff *skb;
 	DECLARE_MAC_BUF(mac);
 
 	hdr = (struct ieee80211_hdr *)rx->skb->data;
+	fc = hdr->frame_control;
 	sc = le16_to_cpu(hdr->seq_ctrl);
 	frag = sc & IEEE80211_SCTL_FRAG;
 
-	if (likely((!ieee80211_has_morefrags(hdr->frame_control) && frag == 0) ||
+	if (likely((!ieee80211_has_morefrags(fc) && frag == 0) ||
 		   (rx->skb)->len < 24 ||
 		   is_multicast_ether_addr(hdr->addr1))) {
 		/* not fragmented */
@@ -891,7 +893,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
 						 rx->queue, &(rx->skb));
 		if (rx->key && rx->key->conf.alg == ALG_CCMP &&
-		    ieee80211_has_protected(hdr->frame_control)) {
+		    ieee80211_has_protected(fc)) {
 			/* Store CCMP PN so that we can verify that the next
 			 * fragment has a sequential PN value. */
 			entry->ccmp = 1;
@@ -930,11 +932,11 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		memcpy(entry->last_pn, pn, CCMP_PN_LEN);
 	}
 
-	skb_pull(rx->skb, ieee80211_hdrlen(hdr->frame_control));
+	skb_pull(rx->skb, ieee80211_hdrlen(fc));
 	__skb_queue_tail(&entry->skb_list, rx->skb);
 	entry->last_frag = frag;
 	entry->extra_len += rx->skb->len;
-	if (rx->fc & IEEE80211_FCTL_MOREFRAGS) {
+	if (ieee80211_has_morefrags(fc)) {
 		rx->skb = NULL;
 		return RX_QUEUED;
 	}
@@ -974,10 +976,9 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
 	struct sk_buff *skb;
 	int no_pending_pkts;
 	DECLARE_MAC_BUF(mac);
+	__le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control;
 
-	if (likely(!rx->sta ||
-		   (rx->fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_CTL ||
-		   (rx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_PSPOLL ||
+	if (likely(!rx->sta || !ieee80211_is_pspoll(fc) ||
 		   !(rx->flags & IEEE80211_RX_RA_MATCH)))
 		return RX_CONTINUE;
 
@@ -1073,7 +1074,7 @@ ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx)
 }
 
 static int
-ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx)
+ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc)
 {
 	/*
 	 * Pass through unencrypted frames if the hardware has
@@ -1083,9 +1084,8 @@ ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx)
 		return 0;
 
 	/* Drop unencrypted frames if key is set. */
-	if (unlikely(!(rx->fc & IEEE80211_FCTL_PROTECTED) &&
-		     (rx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA &&
-		     (rx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_NULLFUNC &&
+	if (unlikely(!ieee80211_has_protected(fc) &&
+		     !ieee80211_is_nullfunc(fc) &&
 		     (rx->key || rx->sdata->drop_unencrypted)))
 		return -EACCES;
 
@@ -1184,7 +1184,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 /*
  * requires that rx->skb is a frame with ethernet header
  */
-static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx)
+static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
 {
 	static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
 		= { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
@@ -1200,7 +1200,7 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx)
 		return true;
 
 	if (ieee80211_802_1x_port_control(rx) ||
-	    ieee80211_drop_unencrypted(rx))
+	    ieee80211_drop_unencrypted(rx, fc))
 		return false;
 
 	return true;
@@ -1270,20 +1270,21 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 {
 	struct net_device *dev = rx->dev;
 	struct ieee80211_local *local = rx->local;
-	u16 fc, ethertype;
+	u16 ethertype;
 	u8 *payload;
 	struct sk_buff *skb = rx->skb, *frame = NULL;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	__le16 fc = hdr->frame_control;
 	const struct ethhdr *eth;
 	int remaining, err;
 	u8 dst[ETH_ALEN];
 	u8 src[ETH_ALEN];
 	DECLARE_MAC_BUF(mac);
 
-	fc = rx->fc;
-	if (unlikely((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA))
+	if (unlikely(!ieee80211_is_data(fc)))
 		return RX_CONTINUE;
 
-	if (unlikely(!WLAN_FC_DATA_PRESENT(fc)))
+	if (unlikely(!ieee80211_is_data_present(fc)))
 		return RX_DROP_MONITOR;
 
 	if (!(rx->flags & IEEE80211_RX_AMSDU))
@@ -1365,7 +1366,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 			memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
 		}
 
-		if (!ieee80211_frame_allowed(rx)) {
+		if (!ieee80211_frame_allowed(rx, fc)) {
 			if (skb == frame) /* last frame */
 				return RX_DROP_UNUSABLE;
 			dev_kfree_skb(frame);
@@ -1439,21 +1440,21 @@ static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
 {
 	struct net_device *dev = rx->dev;
-	u16 fc;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	__le16 fc = hdr->frame_control;
 	int err;
 
-	fc = rx->fc;
-	if (unlikely((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA))
+	if (unlikely(!ieee80211_is_data(hdr->frame_control)))
 		return RX_CONTINUE;
 
-	if (unlikely(!WLAN_FC_DATA_PRESENT(fc)))
+	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
 		return RX_DROP_MONITOR;
 
 	err = ieee80211_data_to_8023(rx);
 	if (unlikely(err))
 		return RX_DROP_UNUSABLE;
 
-	if (!ieee80211_frame_allowed(rx))
+	if (!ieee80211_frame_allowed(rx, fc))
 		return RX_DROP_MONITOR;
 
 	rx->skb->dev = dev;
@@ -1818,13 +1819,12 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_hdr *hdr;
 	struct ieee80211_rx_data rx;
-	u16 type;
 	int prepares;
 	struct ieee80211_sub_if_data *prev = NULL;
 	struct sk_buff *skb_new;
 	u8 *bssid;
 
-	hdr = (struct ieee80211_hdr *) skb->data;
+	hdr = (struct ieee80211_hdr *)skb->data;
 	memset(&rx, 0, sizeof(rx));
 	rx.skb = skb;
 	rx.local = local;
@@ -1832,9 +1832,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 	rx.status = status;
 	rx.rate = rate;
 	rx.fc = le16_to_cpu(hdr->frame_control);
-	type = rx.fc & IEEE80211_FCTL_FTYPE;
 
-	if (type == IEEE80211_FTYPE_DATA || type == IEEE80211_FTYPE_MGMT)
+	if (ieee80211_is_data(hdr->frame_control) || ieee80211_is_mgmt(hdr->frame_control))
 		local->dot11ReceivedFragmentCount++;
 
 	rx.sta = sta_info_get(local, hdr->addr2);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 9c60dedc368..96ffb4d8dfb 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -82,6 +82,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
 	struct ieee80211_rate *txrate;
 	struct ieee80211_local *local = tx->local;
 	struct ieee80211_supported_band *sband;
+	struct ieee80211_hdr *hdr;
 
 	sband = local->hw.wiphy->bands[tx->channel->band];
 	txrate = &sband->bitrates[tx->rate_idx];
@@ -107,8 +108,8 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
 	 *   at the highest possible rate belonging to the PHY rates in the
 	 *   BSSBasicRateSet
 	 */
-
-	if ((tx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_CTL) {
+	hdr = (struct ieee80211_hdr *)tx->skb->data;
+	if (ieee80211_is_ctl(hdr->frame_control)) {
 		/* TODO: These control frames are not currently sent by
 		 * 80211.o, but should they be implemented, this function
 		 * needs to be updated to support duration field calculation.
@@ -213,9 +214,8 @@ static int inline is_ieee80211_device(struct net_device *dev,
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 {
-#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
-#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	u32 sta_flags;
 
@@ -223,8 +223,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 		return TX_CONTINUE;
 
 	if (unlikely(tx->local->sta_sw_scanning) &&
-	    ((tx->fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT ||
-	     (tx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_PROBE_REQ))
+	    !ieee80211_is_probe_req(hdr->frame_control))
 		return TX_DROP;
 
 	if (tx->sdata->vif.type == IEEE80211_IF_TYPE_MESH_POINT)
@@ -238,7 +237,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 	if (likely(tx->flags & IEEE80211_TX_UNICAST)) {
 		if (unlikely(!(sta_flags & WLAN_STA_ASSOC) &&
 			     tx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
-			     (tx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA)) {
+			     ieee80211_is_data(hdr->frame_control))) {
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 			DECLARE_MAC_BUF(mac);
 			printk(KERN_DEBUG "%s: dropped data frame to not "
@@ -249,7 +248,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 			return TX_DROP;
 		}
 	} else {
-		if (unlikely((tx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA &&
+		if (unlikely(ieee80211_is_data(hdr->frame_control) &&
 			     tx->local->num_sta == 0 &&
 			     tx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS)) {
 			/*
@@ -315,6 +314,7 @@ static ieee80211_tx_result
 ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
 
 	/*
 	 * broadcast/multicast frame
@@ -329,7 +329,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
 		return TX_CONTINUE;
 
 	/* no buffering for ordered frames */
-	if (tx->fc & IEEE80211_FCTL_ORDER)
+	if (ieee80211_has_order(hdr->frame_control))
 		return TX_CONTINUE;
 
 	/* no stations in PS mode */
@@ -367,12 +367,11 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 {
 	struct sta_info *sta = tx->sta;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
 	u32 staflags;
 	DECLARE_MAC_BUF(mac);
 
-	if (unlikely(!sta ||
-		     ((tx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_MGMT &&
-		      (tx->fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_PROBE_RESP)))
+	if (unlikely(!sta || ieee80211_is_probe_resp(hdr->frame_control)))
 		return TX_CONTINUE;
 
 	staflags = get_sta_flags(sta);
@@ -437,7 +436,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	u16 fc = tx->fc;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
 
 	if (unlikely(tx->skb->do_not_encrypt))
 		tx->key = NULL;
@@ -454,22 +453,16 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		tx->key = NULL;
 
 	if (tx->key) {
-		u16 ftype, stype;
-
 		tx->key->tx_rx_count++;
 		/* TODO: add threshold stuff again */
 
 		switch (tx->key->conf.alg) {
 		case ALG_WEP:
-			ftype = fc & IEEE80211_FCTL_FTYPE;
-			stype = fc & IEEE80211_FCTL_STYPE;
-
-			if (ftype == IEEE80211_FTYPE_MGMT &&
-			    stype == IEEE80211_STYPE_AUTH)
+			if (ieee80211_is_auth(hdr->frame_control))
 				break;
 		case ALG_TKIP:
 		case ALG_CCMP:
-			if (!WLAN_FC_DATA_PRESENT(fc))
+			if (!ieee80211_is_data_present(hdr->frame_control))
 				tx->key = NULL;
 			break;
 		}
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 639998775b4..376c84987e4 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -291,9 +291,10 @@ u8 * ieee80211_wep_is_weak_iv(struct sk_buff *skb, struct ieee80211_key *key)
 ieee80211_rx_result
 ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
 {
-	if ((rx->fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA &&
-	    ((rx->fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT ||
-	     (rx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_AUTH))
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+
+	if (!ieee80211_is_data(hdr->frame_control) &&
+	    !ieee80211_is_auth(hdr->frame_control))
 		return RX_CONTINUE;
 
 	if (!(rx->status->flag & RX_FLAG_DECRYPTED)) {
-- 
cgit v1.2.3


From a4b7d7bda566acaa65fbf767f65a83b3a8dc74b9 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:14 -0700
Subject: mac80211: remove rx/tx_data->fc member

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h | 4 ++--
 net/mac80211/rx.c          | 8 ++------
 net/mac80211/tx.c          | 1 -
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a2870bc245d..978c3a03ea5 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -172,7 +172,7 @@ struct ieee80211_tx_data {
 	struct sk_buff **extra_frag;
 	int num_extra_frag;
 
-	u16 fc, ethertype;
+	u16 ethertype;
 	unsigned int flags;
 };
 
@@ -200,7 +200,7 @@ struct ieee80211_rx_data {
 	struct ieee80211_rx_status *status;
 	struct ieee80211_rate *rate;
 
-	u16 fc, ethertype;
+	u16 ethertype;
 	unsigned int flags;
 	int sent_ps_buffered;
 	int queue;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3a96251379f..2464263cb7a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1057,7 +1057,6 @@ ieee80211_rx_h_remove_qos_control(struct ieee80211_rx_data *rx)
 		ieee80211_hdrlen(hdr->frame_control) - IEEE80211_QOS_CTL_LEN);
 	hdr = (struct ieee80211_hdr *)skb_pull(rx->skb, IEEE80211_QOS_CTL_LEN);
 	/* change frame type to non QOS */
-	rx->fc &= ~IEEE80211_STYPE_QOS_DATA;
 	hdr->frame_control &= ~cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
 
 	return RX_CONTINUE;
@@ -1831,7 +1830,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 
 	rx.status = status;
 	rx.rate = rate;
-	rx.fc = le16_to_cpu(hdr->frame_control);
 
 	if (ieee80211_is_data(hdr->frame_control) || ieee80211_is_mgmt(hdr->frame_control))
 		local->dot11ReceivedFragmentCount++;
@@ -1894,14 +1892,12 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 				       prev->dev->name);
 			continue;
 		}
-		rx.fc = le16_to_cpu(hdr->frame_control);
 		ieee80211_invoke_rx_handlers(prev, &rx, skb_new);
 		prev = sdata;
 	}
-	if (prev) {
-		rx.fc = le16_to_cpu(hdr->frame_control);
+	if (prev)
 		ieee80211_invoke_rx_handlers(prev, &rx, skb);
-	} else
+	else
 		dev_kfree_skb(skb);
 }
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 96ffb4d8dfb..85f3ba85c13 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -993,7 +993,6 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 	hdr = (struct ieee80211_hdr *) skb->data;
 
 	tx->sta = sta_info_get(local, hdr->addr1);
-	tx->fc = le16_to_cpu(hdr->frame_control);
 
 	if (is_multicast_ether_addr(hdr->addr1)) {
 		tx->flags &= ~IEEE80211_TX_UNICAST;
-- 
cgit v1.2.3


From 4eb2ae9a42b77de48ee9fecfaccc66c640313188 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 15 Jul 2008 18:44:15 -0700
Subject: mac80211: remove WLAN_FC_DATA_PRESENT

All users are gone now.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 978c3a03ea5..3cad0172b45 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -36,8 +36,6 @@
 #define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */
 #endif /* ETH_P_PAE */
 
-#define WLAN_FC_DATA_PRESENT(fc) (((fc) & 0x4c) == 0x08)
-
 struct ieee80211_local;
 
 /* Maximum number of broadcast/multicast frames to buffer when some of the
-- 
cgit v1.2.3


From bdbe819540f3365249095692118dbfeee308140d Mon Sep 17 00:00:00 2001
From: Luis Carlos Cobo <luisca@cozybit.com>
Date: Thu, 14 Aug 2008 10:40:48 -0700
Subject: mac80211: allow no mac address until firmware load

Originally by Johannes Berg. This patch adds support for devices that do not
report their MAC address until the firmware is loaded. While the address is not
known, a multicast on is used.

Signed-off-by: Luis Carlos Cobo <luisca@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index f5537f90dd3..93dcdc27254 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -187,9 +187,15 @@ static int ieee80211_open(struct net_device *dev)
 	u32 changed = 0;
 	int res;
 	bool need_hw_reconfig = 0;
+	u8 null_addr[ETH_ALEN] = {0};
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
+	/* fail early if user set an invalid address */
+	if (compare_ether_addr(dev->dev_addr, null_addr) &&
+	    !is_valid_ether_addr(dev->dev_addr))
+		return -EADDRNOTAVAIL;
+
 	/* we hold the RTNL here so can safely walk the list */
 	list_for_each_entry(nsdata, &local->interfaces, list) {
 		struct net_device *ndev = nsdata->dev;
@@ -270,6 +276,36 @@ static int ieee80211_open(struct net_device *dev)
 		ieee80211_led_radio(local, local->hw.conf.radio_enabled);
 	}
 
+	/*
+	 * Check all interfaces and copy the hopefully now-present
+	 * MAC address to those that have the special null one.
+	 */
+	list_for_each_entry(nsdata, &local->interfaces, list) {
+		struct net_device *ndev = nsdata->dev;
+
+		/*
+		 * No need to check netif_running since we do not allow
+		 * it to start up with this invalid address.
+		 */
+		if (compare_ether_addr(null_addr, ndev->dev_addr) == 0)
+			memcpy(ndev->dev_addr,
+			       local->hw.wiphy->perm_addr,
+			       ETH_ALEN);
+	}
+
+	if (compare_ether_addr(null_addr, local->mdev->dev_addr) == 0)
+		memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr,
+		       ETH_ALEN);
+
+	/*
+	 * Validate the MAC address for this device.
+	 */
+	if (!is_valid_ether_addr(dev->dev_addr)) {
+		if (!local->open_count && local->ops->stop)
+			local->ops->stop(local_to_hw(local));
+		return -EADDRNOTAVAIL;
+	}
+
 	switch (sdata->vif.type) {
 	case IEEE80211_IF_TYPE_VLAN:
 		/* no need to tell driver */
@@ -975,6 +1011,8 @@ void ieee80211_if_setup(struct net_device *dev)
 	dev->open = ieee80211_open;
 	dev->stop = ieee80211_stop;
 	dev->destructor = free_netdev;
+	/* we will validate the address ourselves in ->open */
+	dev->validate_addr = NULL;
 }
 
 /* everything else */
-- 
cgit v1.2.3


From 02589f60510030a3c1496e7a8c511e4f674ef5ff Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Sat, 2 Aug 2008 15:10:57 -0300
Subject: rfkill: detect bogus double-registering (v2)

Detect and abort with -EEXIST if rfkill_register is called twice on the
same rfkill struct.  And WARN_ON(it) for good measure.

While at it, flag when we are adding the first switch of a type, we will
need that information later.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 35a9994e233..1f23de20a85 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -525,17 +525,44 @@ static struct class rfkill_class = {
 	.dev_uevent	= rfkill_dev_uevent,
 };
 
+static int rfkill_check_duplicity(const struct rfkill *rfkill)
+{
+	struct rfkill *p;
+	unsigned long seen[BITS_TO_LONGS(RFKILL_TYPE_MAX)];
+
+	memset(seen, 0, sizeof(seen));
+
+	list_for_each_entry(p, &rfkill_list, node) {
+		if (p == rfkill) {
+			WARN_ON(1);
+			return -EEXIST;
+		}
+		set_bit(p->type, seen);
+	}
+
+	/* 0: first switch of its kind */
+	return test_bit(rfkill->type, seen);
+}
+
 static int rfkill_add_switch(struct rfkill *rfkill)
 {
+	int error;
+
 	mutex_lock(&rfkill_mutex);
 
+	error = rfkill_check_duplicity(rfkill);
+	if (error < 0)
+		goto unlock_out;
+
 	rfkill_toggle_radio(rfkill, rfkill_states[rfkill->type], 0);
 
 	list_add_tail(&rfkill->node, &rfkill_list);
 
+	error = 0;
+unlock_out:
 	mutex_unlock(&rfkill_mutex);
 
-	return 0;
+	return error;
 }
 
 static void rfkill_remove_switch(struct rfkill *rfkill)
-- 
cgit v1.2.3


From 9961920199ec88d6b581d3e38502088935925c04 Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Sat, 2 Aug 2008 15:10:58 -0300
Subject: rfkill: add default global states (v2)

Add a second set of global states, "rfkill_default_states", to track the
state that will be used when the first rfkill class of a given type is
registered, and also to save "undo" information when rfkill_epo is called.

Add a new exported function, rfkill_set_default(), which can be used by
platform drivers to restore radio state saved by the platform across
reboots or shutdown.

Also, fix rfkill_epo to properly update rfkill_states, but still preserve a
copy of the state so that we can undo the effect of rfkill_epo later if we
want to.  Add rfkill_restore_states() to restore rfkill_states from the
copy.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill-input.h |   1 +
 net/rfkill/rfkill.c       | 127 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill-input.h b/net/rfkill/rfkill-input.h
index f63d0504568..bbfa646157c 100644
--- a/net/rfkill/rfkill-input.h
+++ b/net/rfkill/rfkill-input.h
@@ -13,5 +13,6 @@
 
 void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state);
 void rfkill_epo(void);
+void rfkill_restore_states(void);
 
 #endif /* __RFKILL_INPUT_H */
diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 1f23de20a85..b995fa32cf8 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -44,7 +44,13 @@ module_param_named(default_state, rfkill_default_state, uint, 0444);
 MODULE_PARM_DESC(default_state,
 		 "Default initial state for all radio types, 0 = radio off");
 
-static enum rfkill_state rfkill_states[RFKILL_TYPE_MAX];
+struct rfkill_gsw_state {
+	enum rfkill_state current_state;
+	enum rfkill_state default_state;
+};
+
+static struct rfkill_gsw_state rfkill_global_states[RFKILL_TYPE_MAX];
+static unsigned long rfkill_states_lockdflt[BITS_TO_LONGS(RFKILL_TYPE_MAX)];
 
 static BLOCKING_NOTIFIER_HEAD(rfkill_notifier_list);
 
@@ -213,22 +219,22 @@ static int rfkill_toggle_radio(struct rfkill *rfkill,
 }
 
 /**
- * rfkill_switch_all - Toggle state of all switches of given type
+ * __rfkill_switch_all - Toggle state of all switches of given type
  * @type: type of interfaces to be affected
  * @state: the new state
  *
  * This function toggles the state of all switches of given type,
  * unless a specific switch is claimed by userspace (in which case,
  * that switch is left alone) or suspended.
+ *
+ * Caller must have acquired rfkill_mutex.
  */
-void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state)
+static void __rfkill_switch_all(const enum rfkill_type type,
+				const enum rfkill_state state)
 {
 	struct rfkill *rfkill;
 
-	mutex_lock(&rfkill_mutex);
-
-	rfkill_states[type] = state;
-
+	rfkill_global_states[type].current_state = state;
 	list_for_each_entry(rfkill, &rfkill_list, node) {
 		if ((!rfkill->user_claim) && (rfkill->type == type)) {
 			mutex_lock(&rfkill->mutex);
@@ -236,7 +242,20 @@ void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state)
 			mutex_unlock(&rfkill->mutex);
 		}
 	}
+}
 
+/**
+ * rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @state: the new state
+ *
+ * Acquires rfkill_mutex and calls __rfkill_switch_all(@type, @state).
+ * Please refer to __rfkill_switch_all() for details.
+ */
+void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state)
+{
+	mutex_lock(&rfkill_mutex);
+	__rfkill_switch_all(type, state);
 	mutex_unlock(&rfkill_mutex);
 }
 EXPORT_SYMBOL(rfkill_switch_all);
@@ -246,10 +265,14 @@ EXPORT_SYMBOL(rfkill_switch_all);
  *
  * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED,
  * ignoring everything in its path but rfkill_mutex and rfkill->mutex.
+ *
+ * The global state before the EPO is saved and can be restored later
+ * using rfkill_restore_states().
  */
 void rfkill_epo(void)
 {
 	struct rfkill *rfkill;
+	int i;
 
 	mutex_lock(&rfkill_mutex);
 	list_for_each_entry(rfkill, &rfkill_list, node) {
@@ -257,10 +280,34 @@ void rfkill_epo(void)
 		rfkill_toggle_radio(rfkill, RFKILL_STATE_SOFT_BLOCKED, 1);
 		mutex_unlock(&rfkill->mutex);
 	}
+	for (i = 0; i < RFKILL_TYPE_MAX; i++) {
+		rfkill_global_states[i].default_state =
+				rfkill_global_states[i].current_state;
+		rfkill_global_states[i].current_state =
+				RFKILL_STATE_SOFT_BLOCKED;
+	}
 	mutex_unlock(&rfkill_mutex);
 }
 EXPORT_SYMBOL_GPL(rfkill_epo);
 
+/**
+ * rfkill_restore_states - restore global states
+ *
+ * Restore (and sync switches to) the global state from the
+ * states in rfkill_default_states.  This can undo the effects of
+ * a call to rfkill_epo().
+ */
+void rfkill_restore_states(void)
+{
+	int i;
+
+	mutex_lock(&rfkill_mutex);
+	for (i = 0; i < RFKILL_TYPE_MAX; i++)
+		__rfkill_switch_all(i, rfkill_global_states[i].default_state);
+	mutex_unlock(&rfkill_mutex);
+}
+EXPORT_SYMBOL_GPL(rfkill_restore_states);
+
 /**
  * rfkill_force_state - Force the internal rfkill radio state
  * @rfkill: pointer to the rfkill class to modify.
@@ -406,8 +453,8 @@ static ssize_t rfkill_claim_store(struct device *dev,
 		if (!claim) {
 			mutex_lock(&rfkill->mutex);
 			rfkill_toggle_radio(rfkill,
-					    rfkill_states[rfkill->type],
-					    0);
+					rfkill_global_states[rfkill->type].current_state,
+					0);
 			mutex_unlock(&rfkill->mutex);
 		}
 		rfkill->user_claim = claim;
@@ -554,7 +601,16 @@ static int rfkill_add_switch(struct rfkill *rfkill)
 	if (error < 0)
 		goto unlock_out;
 
-	rfkill_toggle_radio(rfkill, rfkill_states[rfkill->type], 0);
+	if (!error) {
+		/* lock default after first use */
+		set_bit(rfkill->type, rfkill_states_lockdflt);
+		rfkill_global_states[rfkill->type].current_state =
+			rfkill_global_states[rfkill->type].default_state;
+	}
+
+	rfkill_toggle_radio(rfkill,
+			    rfkill_global_states[rfkill->type].current_state,
+			    0);
 
 	list_add_tail(&rfkill->node, &rfkill_list);
 
@@ -710,6 +766,53 @@ void rfkill_unregister(struct rfkill *rfkill)
 }
 EXPORT_SYMBOL(rfkill_unregister);
 
+/**
+ * rfkill_set_default - set initial value for a switch type
+ * @type - the type of switch to set the default state of
+ * @state - the new default state for that group of switches
+ *
+ * Sets the initial state rfkill should use for a given type.
+ * The following initial states are allowed: RFKILL_STATE_SOFT_BLOCKED
+ * and RFKILL_STATE_UNBLOCKED.
+ *
+ * This function is meant to be used by platform drivers for platforms
+ * that can save switch state across power down/reboot.
+ *
+ * The default state for each switch type can be changed exactly once.
+ * After a switch of that type is registered, the default state cannot
+ * be changed anymore.  This guards against multiple drivers it the
+ * same platform trying to set the initial switch default state, which
+ * is not allowed.
+ *
+ * Returns -EPERM if the state has already been set once or is in use,
+ * so drivers likely want to either ignore or at most printk(KERN_NOTICE)
+ * if this function returns -EPERM.
+ *
+ * Returns 0 if the new default state was set, or an error if it
+ * could not be set.
+ */
+int rfkill_set_default(enum rfkill_type type, enum rfkill_state state)
+{
+	int error;
+
+	if (type >= RFKILL_TYPE_MAX ||
+	    (state != RFKILL_STATE_SOFT_BLOCKED &&
+	     state != RFKILL_STATE_UNBLOCKED))
+		return -EINVAL;
+
+	mutex_lock(&rfkill_mutex);
+
+	if (!test_and_set_bit(type, rfkill_states_lockdflt)) {
+		rfkill_global_states[type].default_state = state;
+		error = 0;
+	} else
+		error = -EPERM;
+
+	mutex_unlock(&rfkill_mutex);
+	return error;
+}
+EXPORT_SYMBOL_GPL(rfkill_set_default);
+
 /*
  * Rfkill module initialization/deinitialization.
  */
@@ -723,8 +826,8 @@ static int __init rfkill_init(void)
 	    rfkill_default_state != RFKILL_STATE_UNBLOCKED)
 		return -EINVAL;
 
-	for (i = 0; i < ARRAY_SIZE(rfkill_states); i++)
-		rfkill_states[i] = rfkill_default_state;
+	for (i = 0; i < RFKILL_TYPE_MAX; i++)
+		rfkill_global_states[i].default_state = rfkill_default_state;
 
 	error = class_register(&rfkill_class);
 	if (error) {
-- 
cgit v1.2.3


From 77fba13ccc3a2a3db100892a4a6cc5e2f8290cc7 Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Sat, 2 Aug 2008 15:10:59 -0300
Subject: rfkill: add __must_check annotations

rfkill is not a small, mere detail in wireless support.  Once it starts
supporting rfkill and users start counting on that support, a wireless
device is at risk of operating in dangerous conditions should rfkill
support fail to properly activate.

Therefore, add the required __must_check annotations on some key functions
of the rfkill API, for which the wireless drivers absolutely MUST handle
the failure mode safely in order to avoid a potentially dangerous situation
where the wireless transmitter is left enabled when the user don't want it
to.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index b995fa32cf8..fae7ffade9c 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -645,7 +645,8 @@ static void rfkill_remove_switch(struct rfkill *rfkill)
  * NOTE: If registration fails the structure shoudl be freed by calling
  * rfkill_free() otherwise rfkill_unregister() should be used.
  */
-struct rfkill *rfkill_allocate(struct device *parent, enum rfkill_type type)
+struct rfkill * __must_check rfkill_allocate(struct device *parent,
+					     enum rfkill_type type)
 {
 	struct rfkill *rfkill;
 	struct device *dev;
@@ -716,7 +717,7 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
  * structure needs to be registered. Immediately from registration the
  * switch driver should be able to service calls to toggle_radio.
  */
-int rfkill_register(struct rfkill *rfkill)
+int __must_check rfkill_register(struct rfkill *rfkill)
 {
 	static atomic_t rfkill_no = ATOMIC_INIT(0);
 	struct device *dev = &rfkill->dev;
-- 
cgit v1.2.3


From 96c87607ac8f9b0e641d11ba6e57f8ec0214ea1c Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Sat, 2 Aug 2008 15:11:00 -0300
Subject: rfkill: introduce RFKILL_STATE_MAX

While it is interesting to not add last-enum-markers because it allows gcc
to warn us of switch() statements missing a valid state, we really should
be handling memory corruption on a rfkill state with default clauses,
anyway.

So add RFKILL_STATE_MAX and use it where applicable.  It makes for safer
code in the long run.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index fae7ffade9c..47e0b2d232e 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -201,6 +201,8 @@ static int rfkill_toggle_radio(struct rfkill *rfkill,
 		 * BLOCK even a transmitter that is already in state
 		 * RFKILL_STATE_HARD_BLOCKED */
 		break;
+	default:
+		return -EINVAL;
 	}
 
 	if (force || state != rfkill->state) {
@@ -234,6 +236,9 @@ static void __rfkill_switch_all(const enum rfkill_type type,
 {
 	struct rfkill *rfkill;
 
+	if (unlikely(state >= RFKILL_STATE_MAX))
+		return;
+
 	rfkill_global_states[type].current_state = state;
 	list_for_each_entry(rfkill, &rfkill_list, node) {
 		if ((!rfkill->user_claim) && (rfkill->type == type)) {
@@ -329,9 +334,7 @@ int rfkill_force_state(struct rfkill *rfkill, enum rfkill_state state)
 {
 	enum rfkill_state oldstate;
 
-	if (state != RFKILL_STATE_SOFT_BLOCKED &&
-	    state != RFKILL_STATE_UNBLOCKED &&
-	    state != RFKILL_STATE_HARD_BLOCKED)
+	if (unlikely(state >= RFKILL_STATE_MAX))
 		return -EINVAL;
 
 	mutex_lock(&rfkill->mutex);
@@ -727,6 +730,8 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 		return -EINVAL;
 	if (rfkill->type >= RFKILL_TYPE_MAX)
 		return -EINVAL;
+	if (rfkill->state >= RFKILL_STATE_MAX)
+		return -EINVAL;
 
 	snprintf(dev->bus_id, sizeof(dev->bus_id),
 		 "rfkill%ld", (long)atomic_inc_return(&rfkill_no) - 1);
-- 
cgit v1.2.3


From fef1643bf0cdd092a52dc3378479e4811fd65152 Mon Sep 17 00:00:00 2001
From: Jasper Bryant-Greene <jasper@amiton.co.nz>
Date: Sun, 3 Aug 2008 11:30:55 +1200
Subject: move ETH_P_PAE from ieee80211_i.h to if_ether.h

ETH_P_PAE belongs in if_ether.h with the other ETH_P_* definitions. This
patch moves it there.

Signed-off-by: Jasper Bryant-Greene <jasper@amiton.co.nz>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 3cad0172b45..168f845c173 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -32,10 +32,6 @@
 /* ieee80211.o internal definitions, etc. These are not included into
  * low-level drivers. */
 
-#ifndef ETH_P_PAE
-#define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */
-#endif /* ETH_P_PAE */
-
 struct ieee80211_local;
 
 /* Maximum number of broadcast/multicast frames to buffer when some of the
-- 
cgit v1.2.3


From f698d856f65c3fea091cc303a135967965c5b880 Mon Sep 17 00:00:00 2001
From: Jasper Bryant-Greene <jasper@amiton.co.nz>
Date: Sun, 3 Aug 2008 12:04:37 +1200
Subject: replace net_device arguments with ieee80211_{local,sub_if_data} as
 appropriate

This patch replaces net_device arguments to mac80211 internal functions
with ieee80211_{local,sub_if_data} as appropriate.

It also does the same for many 802.11s mesh functions, and changes the
mesh path table to be indexed on sub_if_data rather than net_device.

If the mesh part needs to be a separate patch let me know, but since
mesh uses a lot of mac80211 functions which were being converted anyway,
the changes go hand-in-hand somewhat.

This patch probably does not convert all the functions which could be
converted, but it is a large chunk and followup patches will be
provided.

Signed-off-by: Jasper Bryant-Greene <jasper@amiton.co.nz>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c          |  21 +-
 net/mac80211/debugfs_sta.c  |   2 +-
 net/mac80211/event.c        |   5 +-
 net/mac80211/ieee80211_i.h  |  48 ++--
 net/mac80211/iface.c        |   8 +-
 net/mac80211/main.c         |  16 +-
 net/mac80211/mesh.c         |  29 +-
 net/mac80211/mesh.h         |  61 +++--
 net/mac80211/mesh_hwmp.c    | 112 ++++----
 net/mac80211/mesh_pathtbl.c |  76 +++---
 net/mac80211/mesh_plink.c   |  52 ++--
 net/mac80211/mlme.c         | 652 ++++++++++++++++++++------------------------
 net/mac80211/rx.c           |  16 +-
 net/mac80211/tx.c           |   4 +-
 net/mac80211/wext.c         |  49 ++--
 net/mac80211/wpa.c          |   2 +-
 16 files changed, 545 insertions(+), 608 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 297c257864c..6d2ad2bf3ab 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -66,13 +66,16 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
 static int ieee80211_del_iface(struct wiphy *wiphy, int ifindex)
 {
 	struct net_device *dev;
+	struct ieee80211_sub_if_data *sdata;
 
 	/* we're under RTNL */
 	dev = __dev_get_by_index(&init_net, ifindex);
 	if (!dev)
 		return -ENODEV;
 
-	ieee80211_if_remove(dev);
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	ieee80211_if_remove(sdata);
 
 	return 0;
 }
@@ -842,13 +845,13 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
 		return -ENOENT;
 	}
 
-	err = mesh_path_add(dst, dev);
+	err = mesh_path_add(dst, sdata);
 	if (err) {
 		rcu_read_unlock();
 		return err;
 	}
 
-	mpath = mesh_path_lookup(dst, dev);
+	mpath = mesh_path_lookup(dst, sdata);
 	if (!mpath) {
 		rcu_read_unlock();
 		return -ENXIO;
@@ -862,10 +865,12 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
 static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev,
 				 u8 *dst)
 {
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
 	if (dst)
-		return mesh_path_del(dst, dev);
+		return mesh_path_del(dst, sdata);
 
-	mesh_path_flush(dev);
+	mesh_path_flush(sdata);
 	return 0;
 }
 
@@ -897,7 +902,7 @@ static int ieee80211_change_mpath(struct wiphy *wiphy,
 		return -ENOENT;
 	}
 
-	mpath = mesh_path_lookup(dst, dev);
+	mpath = mesh_path_lookup(dst, sdata);
 	if (!mpath) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -965,7 +970,7 @@ static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
 		return -ENOTSUPP;
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst, dev);
+	mpath = mesh_path_lookup(dst, sdata);
 	if (!mpath) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -993,7 +998,7 @@ static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
 		return -ENOTSUPP;
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup_by_idx(idx, dev);
+	mpath = mesh_path_lookup_by_idx(idx, sdata);
 	if (!mpath) {
 		rcu_read_unlock();
 		return -ENOENT;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 79a062782d5..6abe5427752 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -201,7 +201,7 @@ static ssize_t sta_agg_status_write(struct file *file,
 		tid_num = tid_num - 100;
 		if (tid_static_rx[tid_num] == 1) {
 			strcpy(state, "off ");
-			ieee80211_sta_stop_rx_ba_session(dev, da, tid_num, 0,
+			ieee80211_sta_stop_rx_ba_session(sta->sdata, da, tid_num, 0,
 					WLAN_REASON_QSTA_REQUIRE_SETUP);
 			sta->ampdu_mlme.tid_state_rx[tid_num] |=
 					HT_AGG_STATE_DEBUGFS_CTL;
diff --git a/net/mac80211/event.c b/net/mac80211/event.c
index 2280f40b456..8de60de70bc 100644
--- a/net/mac80211/event.c
+++ b/net/mac80211/event.c
@@ -8,7 +8,6 @@
  * mac80211 - events
  */
 
-#include <linux/netdevice.h>
 #include <net/iw_handler.h>
 #include "ieee80211_i.h"
 
@@ -17,7 +16,7 @@
  * (in the variable hdr) must be long enough to extract the TKIP
  * fields like TSC
  */
-void mac80211_ev_michael_mic_failure(struct net_device *dev, int keyidx,
+void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
 				     struct ieee80211_hdr *hdr)
 {
 	union iwreq_data wrqu;
@@ -32,7 +31,7 @@ void mac80211_ev_michael_mic_failure(struct net_device *dev, int keyidx,
 			print_mac(mac, hdr->addr2));
 		memset(&wrqu, 0, sizeof(wrqu));
 		wrqu.data.length = strlen(buf);
-		wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+		wireless_send_event(sdata->dev, IWEVCUSTOM, &wrqu, buf);
 		kfree(buf);
 	}
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 168f845c173..b5d3f58a784 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -851,65 +851,65 @@ u32 ieee80211_handle_ht(struct ieee80211_local *local, int enable_ht,
 
 /* ieee80211_ioctl.c */
 extern const struct iw_handler_def ieee80211_iw_handler_def;
-int ieee80211_set_freq(struct net_device *dev, int freq);
+int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freq);
 
 /* ieee80211_sta.c */
 void ieee80211_sta_timer(unsigned long data);
 void ieee80211_sta_work(struct work_struct *work);
 void ieee80211_sta_scan_work(struct work_struct *work);
-void ieee80211_sta_rx_mgmt(struct net_device *dev, struct sk_buff *skb,
+void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 			   struct ieee80211_rx_status *rx_status);
-int ieee80211_sta_set_ssid(struct net_device *dev, char *ssid, size_t len);
-int ieee80211_sta_get_ssid(struct net_device *dev, char *ssid, size_t *len);
-int ieee80211_sta_set_bssid(struct net_device *dev, u8 *bssid);
-int ieee80211_sta_req_scan(struct net_device *dev, u8 *ssid, size_t ssid_len);
-void ieee80211_sta_req_auth(struct net_device *dev,
+int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
+int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
+int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
+int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len);
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
 			    struct ieee80211_if_sta *ifsta);
-int ieee80211_sta_scan_results(struct net_device *dev,
+int ieee80211_sta_scan_results(struct ieee80211_local *local,
 			       struct iw_request_info *info,
 			       char *buf, size_t len);
 ieee80211_rx_result ieee80211_sta_rx_scan(
-	struct net_device *dev, struct sk_buff *skb,
+	struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 	struct ieee80211_rx_status *rx_status);
 void ieee80211_rx_bss_list_init(struct ieee80211_local *local);
 void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local);
-int ieee80211_sta_set_extra_ie(struct net_device *dev, char *ie, size_t len);
-struct sta_info *ieee80211_ibss_add_sta(struct net_device *dev,
+int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len);
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 					struct sk_buff *skb, u8 *bssid,
 					u8 *addr, u64 supp_rates);
-int ieee80211_sta_deauthenticate(struct net_device *dev, u16 reason);
-int ieee80211_sta_disassociate(struct net_device *dev, u16 reason);
+int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason);
+int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason);
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u32 changed);
-u32 ieee80211_reset_erp_info(struct net_device *dev);
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
 int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
 				   struct ieee80211_ht_info *ht_info);
 int ieee80211_ht_addt_info_ie_to_ht_bss_info(
 			struct ieee80211_ht_addt_info *ht_add_info_ie,
 			struct ieee80211_ht_bss_info *bss_info);
-void ieee80211_send_addba_request(struct net_device *dev, const u8 *da,
+void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da,
 				  u16 tid, u8 dialog_token, u16 start_seq_num,
 				  u16 agg_size, u16 timeout);
-void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid,
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid,
 				u16 initiator, u16 reason_code);
-void ieee80211_send_bar(struct net_device *dev, u8 *ra, u16 tid, u16 ssn);
+void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn);
 
-void ieee80211_sta_stop_rx_ba_session(struct net_device *dev, u8 *da,
+void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *da,
 				u16 tid, u16 initiator, u16 reason);
 void sta_addba_resp_timer_expired(unsigned long data);
-void ieee80211_sta_tear_down_BA_sessions(struct net_device *dev, u8 *addr);
+void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr);
 u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band);
-void ieee80211_sta_tx(struct net_device *dev, struct sk_buff *skb,
+void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		int encrypt);
 void ieee802_11_parse_elems(u8 *start, size_t len,
 				   struct ieee802_11_elems *elems);
 
 #ifdef CONFIG_MAC80211_MESH
-void ieee80211_start_mesh(struct net_device *dev);
+void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
 #else
-static inline void ieee80211_start_mesh(struct net_device *dev)
+static inline void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 {}
 #endif
 
@@ -920,7 +920,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 		     struct vif_params *params);
 int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
 			     enum ieee80211_if_types type);
-void ieee80211_if_remove(struct net_device *dev);
+void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
 void ieee80211_remove_interfaces(struct ieee80211_local *local);
 
 /* tx handling */
@@ -938,7 +938,7 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
 			enum ieee80211_if_types type);
 int ieee80211_frame_duration(struct ieee80211_local *local, size_t len,
 			     int rate, int erp, int short_preamble);
-void mac80211_ev_michael_mic_failure(struct net_device *dev, int keyidx,
+void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
 				     struct ieee80211_hdr *hdr);
 
 #ifdef CONFIG_MAC80211_NOINLINE
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 610ed1d9893..4a623b8e91f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -56,7 +56,7 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 	case IEEE80211_IF_TYPE_MESH_POINT:
 		/* Allow compiler to elide mesh_rmc_free call. */
 		if (ieee80211_vif_is_mesh(&sdata->vif))
-			mesh_rmc_free(dev);
+			mesh_rmc_free(sdata);
 		/* fall through */
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
@@ -241,15 +241,13 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 	return ret;
 }
 
-void ieee80211_if_remove(struct net_device *dev)
+void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
 	ASSERT_RTNL();
 
 	list_del_rcu(&sdata->list);
 	synchronize_rcu();
-	unregister_netdevice(dev);
+	unregister_netdevice(sdata->dev);
 }
 
 /*
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 93dcdc27254..81963948665 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -347,8 +347,8 @@ static int ieee80211_open(struct net_device *dev)
 			goto err_stop;
 
 		if (ieee80211_vif_is_mesh(&sdata->vif))
-			ieee80211_start_mesh(sdata->dev);
-		changed |= ieee80211_reset_erp_info(dev);
+			ieee80211_start_mesh(sdata);
+		changed |= ieee80211_reset_erp_info(sdata);
 		ieee80211_bss_info_change_notify(sdata, changed);
 		ieee80211_enable_keys(sdata);
 
@@ -448,7 +448,7 @@ static int ieee80211_stop(struct net_device *dev)
 
 	list_for_each_entry_rcu(sta, &local->sta_list, list) {
 		if (sta->sdata == sdata)
-			ieee80211_sta_tear_down_BA_sessions(dev, sta->addr);
+			ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
 	}
 
 	rcu_read_unlock();
@@ -706,7 +706,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 	sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
 
 
-	ieee80211_send_addba_request(sta->sdata->dev, ra, tid,
+	ieee80211_send_addba_request(sta->sdata, ra, tid,
 			 sta->ampdu_mlme.tid_tx[tid]->dialog_token,
 			 sta->ampdu_mlme.tid_tx[tid]->ssn,
 			 0x40, 5000);
@@ -889,7 +889,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
 	}
 
 	if (*state & HT_AGG_STATE_INITIATOR_MSK)
-		ieee80211_send_delba(sta->sdata->dev, ra, tid,
+		ieee80211_send_delba(sta->sdata, ra, tid,
 			WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
 
 	agg_queue = sta->tid_to_tx_q[tid];
@@ -1200,10 +1200,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 					     changed);
 }
 
-u32 ieee80211_reset_erp_info(struct net_device *dev)
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
 	sdata->bss_conf.use_cts_prot = 0;
 	sdata->bss_conf.use_short_preamble = 0;
 	return BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE;
@@ -1438,7 +1436,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 			tid = qc[0] & 0xf;
 			ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10)
 						& IEEE80211_SCTL_SEQ);
-			ieee80211_send_bar(sta->sdata->dev, hdr->addr1,
+			ieee80211_send_bar(sta->sdata, hdr->addr1,
 					   tid, ssn);
 		}
 	}
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index b5933b27149..b631703bcc8 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -39,14 +39,13 @@ void ieee80211s_stop(void)
  * mesh_matches_local - check if the config of a mesh point matches ours
  *
  * @ie: information elements of a management frame from the mesh peer
- * @dev: local mesh interface
+ * @sdata: local mesh subif
  *
  * This function checks if the mesh configuration of a mesh point matches the
  * local mesh configuration, i.e. if both nodes belong to the same mesh network.
  */
-bool mesh_matches_local(struct ieee802_11_elems *ie, struct net_device *dev)
+bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *sta = &sdata->u.sta;
 
 	/*
@@ -73,10 +72,8 @@ bool mesh_matches_local(struct ieee802_11_elems *ie, struct net_device *dev)
  * mesh_peer_accepts_plinks - check if an mp is willing to establish peer links
  *
  * @ie: information elements of a management frame from the mesh peer
- * @dev: local mesh interface
  */
-bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie,
-			      struct net_device *dev)
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
 {
 	return (*(ie->mesh_config + CAPAB_OFFSET) & ACCEPT_PLINKS) != 0;
 }
@@ -111,9 +108,8 @@ void mesh_ids_set_default(struct ieee80211_if_sta *sta)
 	memcpy(sta->mesh_cc_id, def_id, 4);
 }
 
-int mesh_rmc_init(struct net_device *dev)
+int mesh_rmc_init(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	int i;
 
 	sdata->u.sta.rmc = kmalloc(sizeof(struct mesh_rmc), GFP_KERNEL);
@@ -125,9 +121,8 @@ int mesh_rmc_init(struct net_device *dev)
 	return 0;
 }
 
-void mesh_rmc_free(struct net_device *dev)
+void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct mesh_rmc *rmc = sdata->u.sta.rmc;
 	struct rmc_entry *p, *n;
 	int i;
@@ -158,9 +153,8 @@ void mesh_rmc_free(struct net_device *dev)
  * it.
  */
 int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr,
-		   struct net_device *dev)
+		   struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct mesh_rmc *rmc = sdata->u.sta.rmc;
 	u32 seqnum = 0;
 	int entries = 0;
@@ -194,10 +188,9 @@ int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr,
 	return 0;
 }
 
-void mesh_mgmt_ies_add(struct sk_buff *skb, struct net_device *dev)
+void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband;
 	u8 *pos;
 	int len, i, rate;
@@ -262,10 +255,10 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct net_device *dev)
 	return;
 }
 
-u32 mesh_table_hash(u8 *addr, struct net_device *dev, struct mesh_table *tbl)
+u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata, struct mesh_table *tbl)
 {
 	/* Use last four bytes of hw addr and interface index as hash index */
-	return jhash_2words(*(u32 *)(addr+2), dev->ifindex, tbl->hash_rnd)
+	return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex, tbl->hash_rnd)
 		& tbl->hash_mask;
 }
 
@@ -434,7 +427,7 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
 	ifsta->preq_id = 0;
 	ifsta->dsn = 0;
 	atomic_set(&ifsta->mpaths, 0);
-	mesh_rmc_init(sdata->dev);
+	mesh_rmc_init(sdata);
 	ifsta->last_preq = jiffies;
 	/* Allocate all mesh structures when creating the first mesh interface. */
 	if (!mesh_allocated)
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 7495fbb0d21..84ff5d828fd 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -47,7 +47,7 @@ enum mesh_path_flags {
  * struct mesh_path - mac80211 mesh path structure
  *
  * @dst: mesh path destination mac address
- * @dev: mesh path device
+ * @sdata: mesh subif
  * @next_hop: mesh neighbor to which frames for this destination will be
  * 	forwarded
  * @timer: mesh path discovery timer
@@ -64,14 +64,14 @@ enum mesh_path_flags {
  * @state_lock: mesh pat state lock
  *
  *
- * The combination of dst and dev is unique in the mesh path table. Since the
+ * The combination of dst and sdata is unique in the mesh path table. Since the
  * next_hop STA is only protected by RCU as well, deleting the STA must also
  * remove/substitute the mesh_path structure and wait until that is no longer
  * reachable before destroying the STA completely.
  */
 struct mesh_path {
 	u8 dst[ETH_ALEN];
-	struct net_device *dev;
+	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *next_hop;
 	struct timer_list timer;
 	struct sk_buff_head frame_queue;
@@ -203,59 +203,66 @@ int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr);
 int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr,
 		struct ieee80211_sub_if_data *sdata);
 int mesh_rmc_check(u8 *addr, struct ieee80211s_hdr *mesh_hdr,
-		struct net_device *dev);
-bool mesh_matches_local(struct ieee802_11_elems *ie, struct net_device *dev);
+		struct ieee80211_sub_if_data *sdata);
+bool mesh_matches_local(struct ieee802_11_elems *ie,
+		struct ieee80211_sub_if_data *sdata);
 void mesh_ids_set_default(struct ieee80211_if_sta *sta);
-void mesh_mgmt_ies_add(struct sk_buff *skb, struct net_device *dev);
-void mesh_rmc_free(struct net_device *dev);
-int mesh_rmc_init(struct net_device *dev);
+void mesh_mgmt_ies_add(struct sk_buff *skb,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
+int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
 void ieee80211s_init(void);
 void ieee80211s_stop(void);
 void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
 
 /* Mesh paths */
-int mesh_nexthop_lookup(struct sk_buff *skb, struct net_device *dev);
-void mesh_path_start_discovery(struct net_device *dev);
-struct mesh_path *mesh_path_lookup(u8 *dst, struct net_device *dev);
-struct mesh_path *mesh_path_lookup_by_idx(int idx, struct net_device *dev);
+int mesh_nexthop_lookup(struct sk_buff *skb,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mesh_path_lookup(u8 *dst,
+		struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mesh_path_lookup_by_idx(int idx,
+		struct ieee80211_sub_if_data *sdata);
 void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop);
-void mesh_path_expire(struct net_device *dev);
-void mesh_path_flush(struct net_device *dev);
-void mesh_rx_path_sel_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
-		size_t len);
-int mesh_path_add(u8 *dst, struct net_device *dev);
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata);
+void mesh_path_flush(struct ieee80211_sub_if_data *sdata);
+void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
+		struct ieee80211_mgmt *mgmt, size_t len);
+int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata);
 /* Mesh plinks */
-void mesh_neighbour_update(u8 *hw_addr, u64 rates, struct net_device *dev,
-		bool add);
-bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie,
-			      struct net_device *dev);
+void mesh_neighbour_update(u8 *hw_addr, u64 rates,
+		struct ieee80211_sub_if_data *sdata, bool add);
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie);
 void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata);
 void mesh_plink_broken(struct sta_info *sta);
 void mesh_plink_deactivate(struct sta_info *sta);
 int mesh_plink_open(struct sta_info *sta);
 int mesh_plink_close(struct sta_info *sta);
 void mesh_plink_block(struct sta_info *sta);
-void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
-			 size_t len, struct ieee80211_rx_status *rx_status);
+void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
+			 struct ieee80211_mgmt *mgmt, size_t len,
+			 struct ieee80211_rx_status *rx_status);
 
 /* Private interfaces */
 /* Mesh tables */
 struct mesh_table *mesh_table_alloc(int size_order);
 void mesh_table_free(struct mesh_table *tbl, bool free_leafs);
 struct mesh_table *mesh_table_grow(struct mesh_table *tbl);
-u32 mesh_table_hash(u8 *addr, struct net_device *dev, struct mesh_table *tbl);
+u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata,
+		struct mesh_table *tbl);
 /* Mesh paths */
 int mesh_path_error_tx(u8 *dest, __le32 dest_dsn, u8 *ra,
-		struct net_device *dev);
+		struct ieee80211_sub_if_data *sdata);
 void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta);
 void mesh_path_flush_pending(struct mesh_path *mpath);
 void mesh_path_tx_pending(struct mesh_path *mpath);
 int mesh_pathtbl_init(void);
 void mesh_pathtbl_unregister(void);
-int mesh_path_del(u8 *addr, struct net_device *dev);
+int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata);
 void mesh_path_timer(unsigned long data);
 void mesh_path_flush_by_nexthop(struct sta_info *sta);
-void mesh_path_discard_frame(struct sk_buff *skb, struct net_device *dev);
+void mesh_path_discard_frame(struct sk_buff *skb,
+		struct ieee80211_sub_if_data *sdata);
 
 #ifdef CONFIG_MAC80211_MESH
 extern int mesh_allocated;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 2cdbd522631..eeb0ce2d5d3 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -82,9 +82,9 @@ enum mpath_frame_type {
 static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 		u8 *orig_addr, __le32 orig_dsn, u8 dst_flags, u8 *dst,
 		__le32 dst_dsn, u8 *da, u8 hop_count, u8 ttl, __le32 lifetime,
-		__le32 metric, __le32 preq_id, struct net_device *dev)
+		__le32 metric, __le32 preq_id, struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos;
@@ -103,7 +103,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 					  IEEE80211_STYPE_ACTION);
 
 	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	/* BSSID is left zeroed, wildcard value */
 	mgmt->u.action.category = MESH_PATH_SEL_CATEGORY;
 	mgmt->u.action.u.mesh_action.action_code = action;
@@ -149,7 +149,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 	pos += ETH_ALEN;
 	memcpy(pos, &dst_dsn, 4);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 	return 0;
 }
 
@@ -161,9 +161,9 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
  * @ra: node this frame is addressed to
  */
 int mesh_path_error_tx(u8 *dst, __le32 dst_dsn, u8 *ra,
-		struct net_device *dev)
+		struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos;
@@ -182,7 +182,7 @@ int mesh_path_error_tx(u8 *dst, __le32 dst_dsn, u8 *ra,
 					  IEEE80211_STYPE_ACTION);
 
 	memcpy(mgmt->da, ra, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	/* BSSID is left zeroed, wildcard value */
 	mgmt->u.action.category = MESH_PATH_SEL_CATEGORY;
 	mgmt->u.action.u.mesh_action.action_code = MPATH_PERR;
@@ -198,7 +198,7 @@ int mesh_path_error_tx(u8 *dst, __le32 dst_dsn, u8 *ra,
 	pos += ETH_ALEN;
 	memcpy(pos, &dst_dsn, 4);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 	return 0;
 }
 
@@ -233,7 +233,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local,
 /**
  * hwmp_route_info_get - Update routing info to originator and transmitter
  *
- * @dev: local mesh interface
+ * @sdata: local mesh subif
  * @mgmt: mesh management frame
  * @hwmp_ie: hwmp information element (PREP or PREQ)
  *
@@ -246,11 +246,11 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local,
  * Notes: this function is the only place (besides user-provided info) where
  * path routing information is updated.
  */
-static u32 hwmp_route_info_get(struct net_device *dev,
+static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 			    struct ieee80211_mgmt *mgmt,
 			    u8 *hwmp_ie)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct mesh_path *mpath;
 	struct sta_info *sta;
 	bool fresh_info;
@@ -301,14 +301,14 @@ static u32 hwmp_route_info_get(struct net_device *dev,
 		new_metric = MAX_METRIC;
 	exp_time = TU_TO_EXP_TIME(orig_lifetime);
 
-	if (memcmp(orig_addr, dev->dev_addr, ETH_ALEN) == 0) {
+	if (memcmp(orig_addr, sdata->dev->dev_addr, ETH_ALEN) == 0) {
 		/* This MP is the originator, we are not interested in this
 		 * frame, except for updating transmitter's path info.
 		 */
 		process = false;
 		fresh_info = false;
 	} else {
-		mpath = mesh_path_lookup(orig_addr, dev);
+		mpath = mesh_path_lookup(orig_addr, sdata);
 		if (mpath) {
 			spin_lock_bh(&mpath->state_lock);
 			if (mpath->flags & MESH_PATH_FIXED)
@@ -324,8 +324,8 @@ static u32 hwmp_route_info_get(struct net_device *dev,
 				}
 			}
 		} else {
-			mesh_path_add(orig_addr, dev);
-			mpath = mesh_path_lookup(orig_addr, dev);
+			mesh_path_add(orig_addr, sdata);
+			mpath = mesh_path_lookup(orig_addr, sdata);
 			if (!mpath) {
 				rcu_read_unlock();
 				return 0;
@@ -357,7 +357,7 @@ static u32 hwmp_route_info_get(struct net_device *dev,
 	else {
 		fresh_info = true;
 
-		mpath = mesh_path_lookup(ta, dev);
+		mpath = mesh_path_lookup(ta, sdata);
 		if (mpath) {
 			spin_lock_bh(&mpath->state_lock);
 			if ((mpath->flags & MESH_PATH_FIXED) ||
@@ -365,8 +365,8 @@ static u32 hwmp_route_info_get(struct net_device *dev,
 					(last_hop_metric > mpath->metric)))
 				fresh_info = false;
 		} else {
-			mesh_path_add(ta, dev);
-			mpath = mesh_path_lookup(ta, dev);
+			mesh_path_add(ta, sdata);
+			mpath = mesh_path_lookup(ta, sdata);
 			if (!mpath) {
 				rcu_read_unlock();
 				return 0;
@@ -392,10 +392,9 @@ static u32 hwmp_route_info_get(struct net_device *dev,
 	return process ? new_metric : 0;
 }
 
-static void hwmp_preq_frame_process(struct net_device *dev,
+static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_mgmt *mgmt,
 				    u8 *preq_elem, u32 metric) {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct mesh_path *mpath;
 	u8 *dst_addr, *orig_addr;
@@ -411,7 +410,7 @@ static void hwmp_preq_frame_process(struct net_device *dev,
 	orig_dsn = PREQ_IE_ORIG_DSN(preq_elem);
 	dst_flags = PREQ_IE_DST_F(preq_elem);
 
-	if (memcmp(dst_addr, dev->dev_addr, ETH_ALEN) == 0) {
+	if (memcmp(dst_addr, sdata->dev->dev_addr, ETH_ALEN) == 0) {
 		forward = false;
 		reply = true;
 		metric = 0;
@@ -423,7 +422,7 @@ static void hwmp_preq_frame_process(struct net_device *dev,
 		}
 	} else {
 		rcu_read_lock();
-		mpath = mesh_path_lookup(dst_addr, dev);
+		mpath = mesh_path_lookup(dst_addr, sdata);
 		if (mpath) {
 			if ((!(mpath->flags & MESH_PATH_DSN_VALID)) ||
 					DSN_LT(mpath->dsn, dst_dsn)) {
@@ -451,7 +450,7 @@ static void hwmp_preq_frame_process(struct net_device *dev,
 				cpu_to_le32(dst_dsn), 0, orig_addr,
 				cpu_to_le32(orig_dsn), mgmt->sa, 0, ttl,
 				cpu_to_le32(lifetime), cpu_to_le32(metric),
-				0, dev);
+				0, sdata);
 		else
 			ifsta->mshstats.dropped_frames_ttl++;
 	}
@@ -472,20 +471,19 @@ static void hwmp_preq_frame_process(struct net_device *dev,
 		hopcount = PREQ_IE_HOPCOUNT(preq_elem) + 1;
 		mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr,
 				cpu_to_le32(orig_dsn), dst_flags, dst_addr,
-				cpu_to_le32(dst_dsn), dev->broadcast,
+				cpu_to_le32(dst_dsn), sdata->dev->broadcast,
 				hopcount, ttl, cpu_to_le32(lifetime),
 				cpu_to_le32(metric), cpu_to_le32(preq_id),
-				dev);
+				sdata);
 		ifsta->mshstats.fwded_frames++;
 	}
 }
 
 
-static void hwmp_prep_frame_process(struct net_device *dev,
+static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_mgmt *mgmt,
 				    u8 *prep_elem, u32 metric)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct mesh_path *mpath;
 	u8 *dst_addr, *orig_addr;
 	u8 ttl, hopcount, flags;
@@ -499,7 +497,7 @@ static void hwmp_prep_frame_process(struct net_device *dev,
 	 * replies
 	 */
 	dst_addr = PREP_IE_DST_ADDR(prep_elem);
-	if (memcmp(dst_addr, dev->dev_addr, ETH_ALEN) == 0)
+	if (memcmp(dst_addr, sdata->dev->dev_addr, ETH_ALEN) == 0)
 		/* destination, no forwarding required */
 		return;
 
@@ -510,7 +508,7 @@ static void hwmp_prep_frame_process(struct net_device *dev,
 	}
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst_addr, dev);
+	mpath = mesh_path_lookup(dst_addr, sdata);
 	if (mpath)
 		spin_lock_bh(&mpath->state_lock);
 	else
@@ -533,7 +531,7 @@ static void hwmp_prep_frame_process(struct net_device *dev,
 		cpu_to_le32(orig_dsn), 0, dst_addr,
 		cpu_to_le32(dst_dsn), mpath->next_hop->addr, hopcount, ttl,
 		cpu_to_le32(lifetime), cpu_to_le32(metric),
-		0, dev);
+		0, sdata);
 	rcu_read_unlock();
 	sdata->u.sta.mshstats.fwded_frames++;
 	return;
@@ -544,7 +542,7 @@ fail:
 	return;
 }
 
-static void hwmp_perr_frame_process(struct net_device *dev,
+static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
 			     struct ieee80211_mgmt *mgmt, u8 *perr_elem)
 {
 	struct mesh_path *mpath;
@@ -555,7 +553,7 @@ static void hwmp_perr_frame_process(struct net_device *dev,
 	dst_addr = PERR_IE_DST_ADDR(perr_elem);
 	dst_dsn = PERR_IE_DST_DSN(perr_elem);
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst_addr, dev);
+	mpath = mesh_path_lookup(dst_addr, sdata);
 	if (mpath) {
 		spin_lock_bh(&mpath->state_lock);
 		if (mpath->flags & MESH_PATH_ACTIVE &&
@@ -566,7 +564,7 @@ static void hwmp_perr_frame_process(struct net_device *dev,
 			mpath->dsn = dst_dsn;
 			spin_unlock_bh(&mpath->state_lock);
 			mesh_path_error_tx(dst_addr, cpu_to_le32(dst_dsn),
-					   dev->broadcast, dev);
+					   sdata->dev->broadcast, sdata);
 		} else
 			spin_unlock_bh(&mpath->state_lock);
 	}
@@ -575,7 +573,7 @@ static void hwmp_perr_frame_process(struct net_device *dev,
 
 
-void mesh_rx_path_sel_frame(struct net_device *dev,
+void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
 			    struct ieee80211_mgmt *mgmt,
 			    size_t len)
 {
@@ -592,25 +590,25 @@ void mesh_rx_path_sel_frame(struct net_device *dev,
 		if (!elems.preq || elems.preq_len != 37)
 			/* Right now we support just 1 destination and no AE */
 			return;
-		last_hop_metric = hwmp_route_info_get(dev, mgmt, elems.preq);
+		last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq);
 		if (!last_hop_metric)
 			return;
-		hwmp_preq_frame_process(dev, mgmt, elems.preq, last_hop_metric);
+		hwmp_preq_frame_process(sdata, mgmt, elems.preq, last_hop_metric);
 		break;
 	case MPATH_PREP:
 		if (!elems.prep || elems.prep_len != 31)
 			/* Right now we support no AE */
 			return;
-		last_hop_metric = hwmp_route_info_get(dev, mgmt, elems.prep);
+		last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep);
 		if (!last_hop_metric)
 			return;
-		hwmp_prep_frame_process(dev, mgmt, elems.prep, last_hop_metric);
+		hwmp_prep_frame_process(sdata, mgmt, elems.prep, last_hop_metric);
 		break;
 	case MPATH_PERR:
 		if (!elems.perr || elems.perr_len != 12)
 			/* Right now we support only one destination per PERR */
 			return;
-		hwmp_perr_frame_process(dev, mgmt, elems.perr);
+		hwmp_perr_frame_process(sdata, mgmt, elems.perr);
 	default:
 		return;
 	}
@@ -628,8 +626,7 @@ void mesh_rx_path_sel_frame(struct net_device *dev,
  */
 static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
 {
-	struct ieee80211_sub_if_data *sdata =
-		IEEE80211_DEV_TO_SUB_IF(mpath->dev);
+	struct ieee80211_sub_if_data *sdata = mpath->sdata;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct mesh_preq_queue *preq_node;
 
@@ -672,12 +669,10 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
 /**
  * mesh_path_start_discovery - launch a path discovery from the PREQ queue
  *
- * @dev: local mesh interface
+ * @sdata: local mesh subif
  */
-void mesh_path_start_discovery(struct net_device *dev)
+void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata =
-		IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct mesh_preq_queue *preq_node;
 	struct mesh_path *mpath;
@@ -699,7 +694,7 @@ void mesh_path_start_discovery(struct net_device *dev)
 	spin_unlock(&ifsta->mesh_preq_queue_lock);
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(preq_node->dst, dev);
+	mpath = mesh_path_lookup(preq_node->dst, sdata);
 	if (!mpath)
 		goto enddiscovery;
 
@@ -743,11 +738,11 @@ void mesh_path_start_discovery(struct net_device *dev)
 		dst_flags = MP_F_RF;
 
 	spin_unlock_bh(&mpath->state_lock);
-	mesh_path_sel_frame_tx(MPATH_PREQ, 0, dev->dev_addr,
+	mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->dev->dev_addr,
 			cpu_to_le32(ifsta->dsn), dst_flags, mpath->dst,
-			cpu_to_le32(mpath->dsn), dev->broadcast, 0,
+			cpu_to_le32(mpath->dsn), sdata->dev->broadcast, 0,
 			ttl, cpu_to_le32(lifetime), 0,
-			cpu_to_le32(ifsta->preq_id++), dev);
+			cpu_to_le32(ifsta->preq_id++), sdata);
 	mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout);
 
 enddiscovery:
@@ -759,7 +754,7 @@ enddiscovery:
  * ieee80211s_lookup_nexthop - put the appropriate next hop on a mesh frame
  *
  * @skb: 802.11 frame to be sent
- * @dev: network device the frame will be sent through
+ * @sdata: network subif the frame will be sent through
  * @fwd_frame: true if this frame was originally from a different host
  *
  * Returns: 0 if the next hop was found. Nonzero otherwise. If no next hop is
@@ -767,9 +762,9 @@ enddiscovery:
  * sent when the path is resolved. This means the caller must not free the skb
  * in this case.
  */
-int mesh_nexthop_lookup(struct sk_buff *skb, struct net_device *dev)
+int mesh_nexthop_lookup(struct sk_buff *skb,
+			struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct sk_buff *skb_to_free = NULL;
 	struct mesh_path *mpath;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -777,11 +772,11 @@ int mesh_nexthop_lookup(struct sk_buff *skb, struct net_device *dev)
 	int err = 0;
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst_addr, dev);
+	mpath = mesh_path_lookup(dst_addr, sdata);
 
 	if (!mpath) {
-		mesh_path_add(dst_addr, dev);
-		mpath = mesh_path_lookup(dst_addr, dev);
+		mesh_path_add(dst_addr, sdata);
+		mpath = mesh_path_lookup(dst_addr, sdata);
 		if (!mpath) {
 			dev_kfree_skb(skb);
 			sdata->u.sta.mshstats.dropped_frames_no_route++;
@@ -793,7 +788,8 @@ int mesh_nexthop_lookup(struct sk_buff *skb, struct net_device *dev)
 	if (mpath->flags & MESH_PATH_ACTIVE) {
 		if (time_after(jiffies, mpath->exp_time -
 			msecs_to_jiffies(sdata->u.sta.mshcfg.path_refresh_time))
-				&& !memcmp(dev->dev_addr, hdr->addr4, ETH_ALEN)
+				&& !memcmp(sdata->dev->dev_addr, hdr->addr4,
+					   ETH_ALEN)
 				&& !(mpath->flags & MESH_PATH_RESOLVING)
 				&& !(mpath->flags & MESH_PATH_FIXED)) {
 			mesh_queue_preq(mpath,
@@ -815,7 +811,7 @@ int mesh_nexthop_lookup(struct sk_buff *skb, struct net_device *dev)
 
 		skb_queue_tail(&mpath->frame_queue, skb);
 		if (skb_to_free)
-			mesh_path_discard_frame(skb_to_free, dev);
+			mesh_path_discard_frame(skb_to_free, sdata);
 		err = -ENOENT;
 	}
 
@@ -835,7 +831,7 @@ void mesh_path_timer(unsigned long data)
 	if (!mpath)
 		goto endmpathtimer;
 	spin_lock_bh(&mpath->state_lock);
-	sdata = IEEE80211_DEV_TO_SUB_IF(mpath->dev);
+	sdata = mpath->sdata;
 	if (mpath->flags & MESH_PATH_RESOLVED ||
 			(!(mpath->flags & MESH_PATH_RESOLVING)))
 		mpath->flags &= ~(MESH_PATH_RESOLVING | MESH_PATH_RESOLVED);
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 838ee60492a..0a60f55f32a 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -9,7 +9,6 @@
 
 #include <linux/etherdevice.h>
 #include <linux/list.h>
-#include <linux/netdevice.h>
 #include <linux/random.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
@@ -62,13 +61,13 @@ void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta)
 /**
  * mesh_path_lookup - look up a path in the mesh path table
  * @dst: hardware address (ETH_ALEN length) of destination
- * @dev: local interface
+ * @sdata: local subif
  *
  * Returns: pointer to the mesh path structure, or NULL if not found
  *
  * Locking: must be called within a read rcu section.
  */
-struct mesh_path *mesh_path_lookup(u8 *dst, struct net_device *dev)
+struct mesh_path *mesh_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata)
 {
 	struct mesh_path *mpath;
 	struct hlist_node *n;
@@ -78,10 +77,10 @@ struct mesh_path *mesh_path_lookup(u8 *dst, struct net_device *dev)
 
 	tbl = rcu_dereference(mesh_paths);
 
-	bucket = &tbl->hash_buckets[mesh_table_hash(dst, dev, tbl)];
+	bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)];
 	hlist_for_each_entry_rcu(node, n, bucket, list) {
 		mpath = node->mpath;
-		if (mpath->dev == dev &&
+		if (mpath->sdata == sdata &&
 				memcmp(dst, mpath->dst, ETH_ALEN) == 0) {
 			if (MPATH_EXPIRED(mpath)) {
 				spin_lock_bh(&mpath->state_lock);
@@ -98,13 +97,13 @@ struct mesh_path *mesh_path_lookup(u8 *dst, struct net_device *dev)
 /**
  * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index
  * @idx: index
- * @dev: local interface, or NULL for all entries
+ * @sdata: local subif, or NULL for all entries
  *
  * Returns: pointer to the mesh path structure, or NULL if not found.
  *
  * Locking: must be called within a read rcu section.
  */
-struct mesh_path *mesh_path_lookup_by_idx(int idx, struct net_device *dev)
+struct mesh_path *mesh_path_lookup_by_idx(int idx, struct ieee80211_sub_if_data *sdata)
 {
 	struct mpath_node *node;
 	struct hlist_node *p;
@@ -112,7 +111,7 @@ struct mesh_path *mesh_path_lookup_by_idx(int idx, struct net_device *dev)
 	int j = 0;
 
 	for_each_mesh_entry(mesh_paths, p, node, i) {
-		if (dev && node->mpath->dev != dev)
+		if (sdata && node->mpath->sdata != sdata)
 			continue;
 		if (j++ == idx) {
 			if (MPATH_EXPIRED(node->mpath)) {
@@ -131,15 +130,14 @@ struct mesh_path *mesh_path_lookup_by_idx(int idx, struct net_device *dev)
 /**
  * mesh_path_add - allocate and add a new path to the mesh path table
  * @addr: destination address of the path (ETH_ALEN length)
- * @dev: local interface
+ * @sdata: local subif
  *
  * Returns: 0 on sucess
  *
  * State: the initial state of the new path is set to 0
  */
-int mesh_path_add(u8 *dst, struct net_device *dev)
+int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct mesh_path *mpath, *new_mpath;
 	struct mpath_node *node, *new_node;
 	struct hlist_head *bucket;
@@ -148,7 +146,7 @@ int mesh_path_add(u8 *dst, struct net_device *dev)
 	int err = 0;
 	u32 hash_idx;
 
-	if (memcmp(dst, dev->dev_addr, ETH_ALEN) == 0)
+	if (memcmp(dst, sdata->dev->dev_addr, ETH_ALEN) == 0)
 		/* never add ourselves as neighbours */
 		return -ENOTSUPP;
 
@@ -169,7 +167,7 @@ int mesh_path_add(u8 *dst, struct net_device *dev)
 
 	read_lock(&pathtbl_resize_lock);
 	memcpy(new_mpath->dst, dst, ETH_ALEN);
-	new_mpath->dev = dev;
+	new_mpath->sdata = sdata;
 	new_mpath->flags = 0;
 	skb_queue_head_init(&new_mpath->frame_queue);
 	new_node->mpath = new_mpath;
@@ -179,7 +177,7 @@ int mesh_path_add(u8 *dst, struct net_device *dev)
 	spin_lock_init(&new_mpath->state_lock);
 	init_timer(&new_mpath->timer);
 
-	hash_idx = mesh_table_hash(dst, dev, mesh_paths);
+	hash_idx = mesh_table_hash(dst, sdata, mesh_paths);
 	bucket = &mesh_paths->hash_buckets[hash_idx];
 
 	spin_lock(&mesh_paths->hashwlock[hash_idx]);
@@ -187,7 +185,7 @@ int mesh_path_add(u8 *dst, struct net_device *dev)
 	err = -EEXIST;
 	hlist_for_each_entry(node, n, bucket, list) {
 		mpath = node->mpath;
-		if (mpath->dev == dev && memcmp(dst, mpath->dst, ETH_ALEN) == 0)
+		if (mpath->sdata == sdata && memcmp(dst, mpath->dst, ETH_ALEN) == 0)
 			goto err_exists;
 	}
 
@@ -241,7 +239,7 @@ void mesh_plink_broken(struct sta_info *sta)
 	struct mesh_path *mpath;
 	struct mpath_node *node;
 	struct hlist_node *p;
-	struct net_device *dev = sta->sdata->dev;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	int i;
 
 	rcu_read_lock();
@@ -256,7 +254,7 @@ void mesh_plink_broken(struct sta_info *sta)
 			spin_unlock_bh(&mpath->state_lock);
 			mesh_path_error_tx(mpath->dst,
 					cpu_to_le32(mpath->dsn),
-					dev->broadcast, dev);
+					sdata->dev->broadcast, sdata);
 		} else
 		spin_unlock_bh(&mpath->state_lock);
 	}
@@ -284,11 +282,11 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 	for_each_mesh_entry(mesh_paths, p, node, i) {
 		mpath = node->mpath;
 		if (mpath->next_hop == sta)
-			mesh_path_del(mpath->dst, mpath->dev);
+			mesh_path_del(mpath->dst, mpath->sdata);
 	}
 }
 
-void mesh_path_flush(struct net_device *dev)
+void mesh_path_flush(struct ieee80211_sub_if_data *sdata)
 {
 	struct mesh_path *mpath;
 	struct mpath_node *node;
@@ -297,16 +295,15 @@ void mesh_path_flush(struct net_device *dev)
 
 	for_each_mesh_entry(mesh_paths, p, node, i) {
 		mpath = node->mpath;
-		if (mpath->dev == dev)
-			mesh_path_del(mpath->dst, mpath->dev);
+		if (mpath->sdata == sdata)
+			mesh_path_del(mpath->dst, mpath->sdata);
 	}
 }
 
 static void mesh_path_node_reclaim(struct rcu_head *rp)
 {
 	struct mpath_node *node = container_of(rp, struct mpath_node, rcu);
-	struct ieee80211_sub_if_data *sdata =
-		IEEE80211_DEV_TO_SUB_IF(node->mpath->dev);
+	struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
 
 	del_timer_sync(&node->mpath->timer);
 	atomic_dec(&sdata->u.sta.mpaths);
@@ -318,11 +315,11 @@ static void mesh_path_node_reclaim(struct rcu_head *rp)
  * mesh_path_del - delete a mesh path from the table
  *
  * @addr: dst address (ETH_ALEN length)
- * @dev: local interface
+ * @sdata: local subif
  *
  * Returns: 0 if succesful
  */
-int mesh_path_del(u8 *addr, struct net_device *dev)
+int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata)
 {
 	struct mesh_path *mpath;
 	struct mpath_node *node;
@@ -332,13 +329,13 @@ int mesh_path_del(u8 *addr, struct net_device *dev)
 	int err = 0;
 
 	read_lock(&pathtbl_resize_lock);
-	hash_idx = mesh_table_hash(addr, dev, mesh_paths);
+	hash_idx = mesh_table_hash(addr, sdata, mesh_paths);
 	bucket = &mesh_paths->hash_buckets[hash_idx];
 
 	spin_lock(&mesh_paths->hashwlock[hash_idx]);
 	hlist_for_each_entry(node, n, bucket, list) {
 		mpath = node->mpath;
-		if (mpath->dev == dev &&
+		if (mpath->sdata == sdata &&
 				memcmp(addr, mpath->dst, ETH_ALEN) == 0) {
 			spin_lock_bh(&mpath->state_lock);
 			mpath->flags |= MESH_PATH_RESOLVING;
@@ -378,29 +375,29 @@ void mesh_path_tx_pending(struct mesh_path *mpath)
  * mesh_path_discard_frame - discard a frame whose path could not be resolved
  *
  * @skb: frame to discard
- * @dev: network device the frame was to be sent through
+ * @sdata: network subif the frame was to be sent through
  *
  * If the frame was beign forwarded from another MP, a PERR frame will be sent
  * to the precursor.
  *
  * Locking: the function must me called within a rcu_read_lock region
  */
-void mesh_path_discard_frame(struct sk_buff *skb, struct net_device *dev)
+void mesh_path_discard_frame(struct sk_buff *skb,
+			     struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct mesh_path *mpath;
 	u32 dsn = 0;
 
-	if (memcmp(hdr->addr4, dev->dev_addr, ETH_ALEN) != 0) {
+	if (memcmp(hdr->addr4, sdata->dev->dev_addr, ETH_ALEN) != 0) {
 		u8 *ra, *da;
 
 		da = hdr->addr3;
 		ra = hdr->addr2;
-		mpath = mesh_path_lookup(da, dev);
+		mpath = mesh_path_lookup(da, sdata);
 		if (mpath)
 			dsn = ++mpath->dsn;
-		mesh_path_error_tx(skb->data, cpu_to_le32(dsn), ra, dev);
+		mesh_path_error_tx(skb->data, cpu_to_le32(dsn), ra, sdata);
 	}
 
 	kfree_skb(skb);
@@ -416,14 +413,11 @@ void mesh_path_discard_frame(struct sk_buff *skb, struct net_device *dev)
  */
 void mesh_path_flush_pending(struct mesh_path *mpath)
 {
-	struct ieee80211_sub_if_data *sdata;
 	struct sk_buff *skb;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(mpath->dev);
-
 	while ((skb = skb_dequeue(&mpath->frame_queue)) &&
 			(mpath->flags & MESH_PATH_ACTIVE))
-		mesh_path_discard_frame(skb, mpath->dev);
+		mesh_path_discard_frame(skb, mpath->sdata);
 }
 
 /**
@@ -472,7 +466,7 @@ static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl)
 	node = hlist_entry(p, struct mpath_node, list);
 	mpath = node->mpath;
 	new_node->mpath = mpath;
-	hash_idx = mesh_table_hash(mpath->dst, mpath->dev, newtbl);
+	hash_idx = mesh_table_hash(mpath->dst, mpath->sdata, newtbl);
 	hlist_add_head(&new_node->list,
 			&newtbl->hash_buckets[hash_idx]);
 	return 0;
@@ -489,7 +483,7 @@ int mesh_pathtbl_init(void)
 	return 0;
 }
 
-void mesh_path_expire(struct net_device *dev)
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
 {
 	struct mesh_path *mpath;
 	struct mpath_node *node;
@@ -498,7 +492,7 @@ void mesh_path_expire(struct net_device *dev)
 
 	read_lock(&pathtbl_resize_lock);
 	for_each_mesh_entry(mesh_paths, p, node, i) {
-		if (node->mpath->dev != dev)
+		if (node->mpath->sdata != sdata)
 			continue;
 		mpath = node->mpath;
 		spin_lock_bh(&mpath->state_lock);
@@ -507,7 +501,7 @@ void mesh_path_expire(struct net_device *dev)
 			time_after(jiffies,
 			 mpath->exp_time + MESH_PATH_EXPIRE)) {
 			spin_unlock_bh(&mpath->state_lock);
-			mesh_path_del(mpath->dst, mpath->dev);
+			mesh_path_del(mpath->dst, mpath->sdata);
 		} else
 			spin_unlock_bh(&mpath->state_lock);
 	}
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 4a7e6d094ff..7714b0e6e4d 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -144,10 +144,10 @@ void mesh_plink_deactivate(struct sta_info *sta)
 	spin_unlock_bh(&sta->lock);
 }
 
-static int mesh_plink_frame_tx(struct net_device *dev,
+static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
 		enum plink_frame_type action, u8 *da, __le16 llid, __le16 plid,
 		__le16 reason) {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
 	struct ieee80211_mgmt *mgmt;
 	bool include_plid = false;
@@ -166,7 +166,7 @@ static int mesh_plink_frame_tx(struct net_device *dev,
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 					  IEEE80211_STYPE_ACTION);
 	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	/* BSSID is left zeroed, wildcard value */
 	mgmt->u.action.category = PLINK_CATEGORY;
 	mgmt->u.action.u.plink_action.action_code = action;
@@ -180,7 +180,7 @@ static int mesh_plink_frame_tx(struct net_device *dev,
 			/* two-byte status code followed by two-byte AID */
 			memset(pos, 0, 4);
 		}
-		mesh_mgmt_ies_add(skb, dev);
+		mesh_mgmt_ies_add(skb, sdata);
 	}
 
 	/* Add Peer Link Management element */
@@ -217,15 +217,14 @@ static int mesh_plink_frame_tx(struct net_device *dev,
 		memcpy(pos, &reason, 2);
 	}
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 	return 0;
 }
 
-void mesh_neighbour_update(u8 *hw_addr, u64 rates, struct net_device *dev,
+void mesh_neighbour_update(u8 *hw_addr, u64 rates, struct ieee80211_sub_if_data *sdata,
 			   bool peer_accepting_plinks)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 
 	rcu_read_lock();
@@ -257,7 +256,6 @@ static void mesh_plink_timer(unsigned long data)
 {
 	struct sta_info *sta;
 	__le16 llid, plid, reason;
-	struct net_device *dev = NULL;
 	struct ieee80211_sub_if_data *sdata;
 #ifdef CONFIG_MAC80211_VERBOSE_MPL_DEBUG
 	DECLARE_MAC_BUF(mac);
@@ -282,7 +280,6 @@ static void mesh_plink_timer(unsigned long data)
 	llid = sta->llid;
 	plid = sta->plid;
 	sdata = sta->sdata;
-	dev = sdata->dev;
 
 	switch (sta->plink_state) {
 	case PLINK_OPN_RCVD:
@@ -299,7 +296,7 @@ static void mesh_plink_timer(unsigned long data)
 			++sta->plink_retries;
 			mod_plink_timer(sta, sta->plink_timeout);
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_OPEN, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->addr, llid,
 					    0, 0);
 			break;
 		}
@@ -312,7 +309,7 @@ static void mesh_plink_timer(unsigned long data)
 		sta->plink_state = PLINK_HOLDING;
 		mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
 		spin_unlock_bh(&sta->lock);
-		mesh_plink_frame_tx(dev, PLINK_CLOSE, sta->addr, llid, plid,
+		mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid, plid,
 				    reason);
 		break;
 	case PLINK_HOLDING:
@@ -357,7 +354,7 @@ int mesh_plink_open(struct sta_info *sta)
 	mpl_dbg("Mesh plink: starting establishment with %s\n",
 		print_mac(mac, sta->addr));
 
-	return mesh_plink_frame_tx(sdata->dev, PLINK_OPEN,
+	return mesh_plink_frame_tx(sdata, PLINK_OPEN,
 				   sta->addr, llid, 0, 0);
 }
 
@@ -403,15 +400,14 @@ int mesh_plink_close(struct sta_info *sta)
 	llid = sta->llid;
 	plid = sta->plid;
 	spin_unlock_bh(&sta->lock);
-	mesh_plink_frame_tx(sta->sdata->dev, PLINK_CLOSE, sta->addr, llid,
+	mesh_plink_frame_tx(sta->sdata, PLINK_CLOSE, sta->addr, llid,
 			    plid, reason);
 	return 0;
 }
 
-void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
+void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt,
 			 size_t len, struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
 	struct ieee802_11_elems elems;
 	struct sta_info *sta;
@@ -478,7 +474,7 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 
 	/* Now we will figure out the appropriate event... */
 	event = PLINK_UNDEFINED;
-	if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, dev))) {
+	if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) {
 		switch (ftype) {
 		case PLINK_OPEN:
 			event = OPN_RJCT;
@@ -577,9 +573,9 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 			sta->llid = llid;
 			mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata));
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_OPEN, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->addr, llid,
 					    0, 0);
-			mesh_plink_frame_tx(dev, PLINK_CONFIRM, sta->addr,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr,
 					    llid, plid, 0);
 			break;
 		default:
@@ -604,7 +600,7 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
@@ -613,7 +609,7 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 			sta->plid = plid;
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
 					    plid, 0);
 			break;
 		case CNF_ACPT:
@@ -646,13 +642,13 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
 					    plid, 0);
 			break;
 		case CNF_ACPT:
@@ -685,7 +681,7 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
@@ -695,7 +691,7 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 			spin_unlock_bh(&sta->lock);
 			mpl_dbg("Mesh plink with %s ESTABLISHED\n",
 					print_mac(mac, sta->addr));
-			mesh_plink_frame_tx(dev, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
 					    plid, 0);
 			break;
 		default:
@@ -714,13 +710,13 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 			llid = sta->llid;
 			mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
 					    plid, 0);
 			break;
 		default:
@@ -743,7 +739,7 @@ void mesh_rx_plink_frame(struct net_device *dev, struct ieee80211_mgmt *mgmt,
 			llid = sta->llid;
 			reason = sta->reason;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(dev, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
 					    plid, reason);
 			break;
 		default:
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ac776c9d48f..fb8e1e742ca 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -73,19 +73,19 @@
 #define IEEE80211_MIN_AMPDU_BUF 0x8
 #define IEEE80211_MAX_AMPDU_BUF 0x40
 
-static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst,
+static void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 				     u8 *ssid, size_t ssid_len);
 static struct ieee80211_sta_bss *
-ieee80211_rx_bss_get(struct net_device *dev, u8 *bssid, int freq,
+ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len);
 static void ieee80211_rx_bss_put(struct ieee80211_local *local,
 				 struct ieee80211_sta_bss *bss);
-static int ieee80211_sta_find_ibss(struct net_device *dev,
+static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta);
-static int ieee80211_sta_wep_configured(struct net_device *dev);
-static int ieee80211_sta_start_scan(struct net_device *dev,
+static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata);
+static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *sdata,
 				    u8 *ssid, size_t ssid_len);
-static int ieee80211_sta_config_auth(struct net_device *dev,
+static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta);
 static void sta_rx_agg_session_timer_expired(unsigned long data);
 
@@ -239,11 +239,10 @@ static int ecw2cw(int ecw)
 }
 
 
-static void ieee80211_sta_def_wmm_params(struct net_device *dev,
+static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_sta_bss *bss,
 					 int ibss)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
 	int i, have_higher_than_11mbit = 0;
 
@@ -281,11 +280,10 @@ static void ieee80211_sta_def_wmm_params(struct net_device *dev,
 	}
 }
 
-static void ieee80211_sta_wmm_params(struct net_device *dev,
+static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
 				     struct ieee80211_if_sta *ifsta,
 				     u8 *wmm_param, size_t wmm_param_len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_tx_queue_params params;
 	size_t left;
 	int count;
@@ -349,14 +347,14 @@ static void ieee80211_sta_wmm_params(struct net_device *dev,
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 		printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d "
 		       "cWmin=%d cWmax=%d txop=%d\n",
-		       dev->name, queue, aci, acm, params.aifs, params.cw_min,
+		       local->mdev->name, queue, aci, acm, params.aifs, params.cw_min,
 		       params.cw_max, params.txop);
 #endif
 		/* TODO: handle ACM (block TX, fallback to next lowest allowed
 		 * AC for now) */
 		if (local->ops->conf_tx(local_to_hw(local), queue, &params)) {
 			printk(KERN_DEBUG "%s: failed to set TX queue "
-			       "parameters for queue %d\n", dev->name, queue);
+			       "parameters for queue %d\n", local->mdev->name, queue);
 		}
 	}
 }
@@ -475,7 +473,7 @@ int ieee80211_ht_addt_info_ie_to_ht_bss_info(
 	return 0;
 }
 
-static void ieee80211_sta_send_associnfo(struct net_device *dev,
+static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_if_sta *ifsta)
 {
 	char *buf;
@@ -520,17 +518,16 @@ static void ieee80211_sta_send_associnfo(struct net_device *dev,
 
 	memset(&wrqu, 0, sizeof(wrqu));
 	wrqu.data.length = len;
-	wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+	wireless_send_event(sdata->dev, IWEVCUSTOM, &wrqu, buf);
 
 	kfree(buf);
 }
 
 
-static void ieee80211_set_associated(struct net_device *dev,
+static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta,
 				     bool assoc)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_conf *conf = &local_to_hw(local)->conf;
 	union iwreq_data wrqu;
@@ -544,7 +541,7 @@ static void ieee80211_set_associated(struct net_device *dev,
 		if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 			return;
 
-		bss = ieee80211_rx_bss_get(dev, ifsta->bssid,
+		bss = ieee80211_rx_bss_get(local, ifsta->bssid,
 					   conf->channel->center_freq,
 					   ifsta->ssid, ifsta->ssid_len);
 		if (bss) {
@@ -568,12 +565,12 @@ static void ieee80211_set_associated(struct net_device *dev,
 		ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
 		memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
 		memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
-		ieee80211_sta_send_associnfo(dev, ifsta);
+		ieee80211_sta_send_associnfo(sdata, ifsta);
 	} else {
-		netif_carrier_off(dev);
-		ieee80211_sta_tear_down_BA_sessions(dev, ifsta->bssid);
+		netif_carrier_off(sdata->dev);
+		ieee80211_sta_tear_down_BA_sessions(sdata, ifsta->bssid);
 		ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
-		changed |= ieee80211_reset_erp_info(dev);
+		changed |= ieee80211_reset_erp_info(sdata);
 
 		sdata->bss_conf.assoc_ht = 0;
 		sdata->bss_conf.ht_conf = NULL;
@@ -588,27 +585,24 @@ static void ieee80211_set_associated(struct net_device *dev,
 	ieee80211_bss_info_change_notify(sdata, changed);
 
 	if (assoc)
-		netif_carrier_on(dev);
+		netif_carrier_on(sdata->dev);
 
 	wrqu.ap_addr.sa_family = ARPHRD_ETHER;
-	wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+	wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
 }
 
-static void ieee80211_set_disassoc(struct net_device *dev,
+static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta, int deauth)
 {
 	if (deauth)
 		ifsta->auth_tries = 0;
 	ifsta->assoc_tries = 0;
-	ieee80211_set_associated(dev, ifsta, 0);
+	ieee80211_set_associated(sdata, ifsta, 0);
 }
 
-void ieee80211_sta_tx(struct net_device *dev, struct sk_buff *skb,
+void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      int encrypt)
 {
-	struct ieee80211_sub_if_data *sdata;
-
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	skb->dev = sdata->local->mdev;
 	skb_set_mac_header(skb, 0);
 	skb_set_network_header(skb, 0);
@@ -621,12 +615,12 @@ void ieee80211_sta_tx(struct net_device *dev, struct sk_buff *skb,
 }
 
 
-static void ieee80211_send_auth(struct net_device *dev,
+static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 				struct ieee80211_if_sta *ifsta,
 				int transaction, u8 *extra, size_t extra_len,
 				int encrypt)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 
@@ -634,7 +628,7 @@ static void ieee80211_send_auth(struct net_device *dev,
 			    sizeof(*mgmt) + 6 + extra_len);
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
-		       "frame\n", dev->name);
+		       "frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -646,7 +640,7 @@ static void ieee80211_send_auth(struct net_device *dev,
 	if (encrypt)
 		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 	mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
 	mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
@@ -655,11 +649,11 @@ static void ieee80211_send_auth(struct net_device *dev,
 	if (extra)
 		memcpy(skb_put(skb, extra_len), extra, extra_len);
 
-	ieee80211_sta_tx(dev, skb, encrypt);
+	ieee80211_sta_tx(sdata, skb, encrypt);
 }
 
 
-static void ieee80211_authenticate(struct net_device *dev,
+static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta)
 {
 	DECLARE_MAC_BUF(mac);
@@ -668,16 +662,16 @@ static void ieee80211_authenticate(struct net_device *dev,
 	if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
 		printk(KERN_DEBUG "%s: authentication with AP %s"
 		       " timed out\n",
-		       dev->name, print_mac(mac, ifsta->bssid));
+		       sdata->dev->name, print_mac(mac, ifsta->bssid));
 		ifsta->state = IEEE80211_DISABLED;
 		return;
 	}
 
 	ifsta->state = IEEE80211_AUTHENTICATE;
 	printk(KERN_DEBUG "%s: authenticate with AP %s\n",
-	       dev->name, print_mac(mac, ifsta->bssid));
+	       sdata->dev->name, print_mac(mac, ifsta->bssid));
 
-	ieee80211_send_auth(dev, ifsta, 1, NULL, 0, 0);
+	ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0);
 
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
 }
@@ -703,10 +697,10 @@ static int ieee80211_compatible_rates(struct ieee80211_sta_bss *bss,
 	return count;
 }
 
-static void ieee80211_send_assoc(struct net_device *dev,
+static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 				 struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos, *ies;
@@ -722,7 +716,7 @@ static void ieee80211_send_assoc(struct net_device *dev,
 			    ifsta->ssid_len);
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
-		       "frame\n", dev->name);
+		       "frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -738,7 +732,7 @@ static void ieee80211_send_assoc(struct net_device *dev,
 			capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
 	}
 
-	bss = ieee80211_rx_bss_get(dev, ifsta->bssid,
+	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
 				   local->hw.conf.channel->center_freq,
 				   ifsta->ssid, ifsta->ssid_len);
 	if (bss) {
@@ -766,7 +760,7 @@ static void ieee80211_send_assoc(struct net_device *dev,
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 
 	if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
@@ -907,21 +901,21 @@ static void ieee80211_send_assoc(struct net_device *dev,
 	if (ifsta->assocreq_ies)
 		memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
 
-static void ieee80211_send_deauth(struct net_device *dev,
+static void ieee80211_send_deauth(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_if_sta *ifsta, u16 reason)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 
 	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer for deauth "
-		       "frame\n", dev->name);
+		       "frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -929,28 +923,28 @@ static void ieee80211_send_deauth(struct net_device *dev,
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 					  IEEE80211_STYPE_DEAUTH);
 	skb_put(skb, 2);
 	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
 
-static void ieee80211_send_disassoc(struct net_device *dev,
+static void ieee80211_send_disassoc(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_if_sta *ifsta, u16 reason)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 
 	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer for disassoc "
-		       "frame\n", dev->name);
+		       "frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -958,21 +952,21 @@ static void ieee80211_send_disassoc(struct net_device *dev,
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 					  IEEE80211_STYPE_DISASSOC);
 	skb_put(skb, 2);
 	mgmt->u.disassoc.reason_code = cpu_to_le16(reason);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
 
-static int ieee80211_privacy_mismatch(struct net_device *dev,
+static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sta_bss *bss;
 	int bss_privacy;
 	int wep_privacy;
@@ -981,14 +975,14 @@ static int ieee80211_privacy_mismatch(struct net_device *dev,
 	if (!ifsta || (ifsta->flags & IEEE80211_STA_MIXED_CELL))
 		return 0;
 
-	bss = ieee80211_rx_bss_get(dev, ifsta->bssid,
+	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
 				   local->hw.conf.channel->center_freq,
 				   ifsta->ssid, ifsta->ssid_len);
 	if (!bss)
 		return 0;
 
 	bss_privacy = !!(bss->capability & WLAN_CAPABILITY_PRIVACY);
-	wep_privacy = !!ieee80211_sta_wep_configured(dev);
+	wep_privacy = !!ieee80211_sta_wep_configured(sdata);
 	privacy_invoked = !!(ifsta->flags & IEEE80211_STA_PRIVACY_INVOKED);
 
 	ieee80211_rx_bss_put(local, bss);
@@ -1000,7 +994,7 @@ static int ieee80211_privacy_mismatch(struct net_device *dev,
 }
 
 
-static void ieee80211_associate(struct net_device *dev,
+static void ieee80211_associate(struct ieee80211_sub_if_data *sdata,
 				struct ieee80211_if_sta *ifsta)
 {
 	DECLARE_MAC_BUF(mac);
@@ -1009,31 +1003,31 @@ static void ieee80211_associate(struct net_device *dev,
 	if (ifsta->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) {
 		printk(KERN_DEBUG "%s: association with AP %s"
 		       " timed out\n",
-		       dev->name, print_mac(mac, ifsta->bssid));
+		       sdata->dev->name, print_mac(mac, ifsta->bssid));
 		ifsta->state = IEEE80211_DISABLED;
 		return;
 	}
 
 	ifsta->state = IEEE80211_ASSOCIATE;
 	printk(KERN_DEBUG "%s: associate with AP %s\n",
-	       dev->name, print_mac(mac, ifsta->bssid));
-	if (ieee80211_privacy_mismatch(dev, ifsta)) {
+	       sdata->dev->name, print_mac(mac, ifsta->bssid));
+	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
 		printk(KERN_DEBUG "%s: mismatch in privacy configuration and "
-		       "mixed-cell disabled - abort association\n", dev->name);
+		       "mixed-cell disabled - abort association\n", sdata->dev->name);
 		ifsta->state = IEEE80211_DISABLED;
 		return;
 	}
 
-	ieee80211_send_assoc(dev, ifsta);
+	ieee80211_send_assoc(sdata, ifsta);
 
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_ASSOC_TIMEOUT);
 }
 
 
-static void ieee80211_associated(struct net_device *dev,
+static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 				 struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 	int disassoc;
 	DECLARE_MAC_BUF(mac);
@@ -1050,7 +1044,7 @@ static void ieee80211_associated(struct net_device *dev,
 	sta = sta_info_get(local, ifsta->bssid);
 	if (!sta) {
 		printk(KERN_DEBUG "%s: No STA entry for own AP %s\n",
-		       dev->name, print_mac(mac, ifsta->bssid));
+		       sdata->dev->name, print_mac(mac, ifsta->bssid));
 		disassoc = 1;
 	} else {
 		disassoc = 0;
@@ -1060,11 +1054,11 @@ static void ieee80211_associated(struct net_device *dev,
 				printk(KERN_DEBUG "%s: No ProbeResp from "
 				       "current AP %s - assume out of "
 				       "range\n",
-				       dev->name, print_mac(mac, ifsta->bssid));
+				       sdata->dev->name, print_mac(mac, ifsta->bssid));
 				disassoc = 1;
 				sta_info_unlink(&sta);
 			} else
-				ieee80211_send_probe_req(dev, ifsta->bssid,
+				ieee80211_send_probe_req(sdata, ifsta->bssid,
 							 local->scan_ssid,
 							 local->scan_ssid_len);
 			ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL;
@@ -1073,7 +1067,7 @@ static void ieee80211_associated(struct net_device *dev,
 			if (time_after(jiffies, ifsta->last_probe +
 				       IEEE80211_PROBE_INTERVAL)) {
 				ifsta->last_probe = jiffies;
-				ieee80211_send_probe_req(dev, ifsta->bssid,
+				ieee80211_send_probe_req(sdata, ifsta->bssid,
 							 ifsta->ssid,
 							 ifsta->ssid_len);
 			}
@@ -1087,7 +1081,7 @@ static void ieee80211_associated(struct net_device *dev,
 
 	if (disassoc) {
 		ifsta->state = IEEE80211_DISABLED;
-		ieee80211_set_associated(dev, ifsta, 0);
+		ieee80211_set_associated(sdata, ifsta, 0);
 	} else {
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_MONITORING_INTERVAL);
@@ -1095,10 +1089,10 @@ static void ieee80211_associated(struct net_device *dev,
 }
 
 
-static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst,
+static void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 				     u8 *ssid, size_t ssid_len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
@@ -1108,7 +1102,7 @@ static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst,
 	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200);
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
-		       "request\n", dev->name);
+		       "request\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -1117,7 +1111,7 @@ static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst,
 	memset(mgmt, 0, 24);
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 					  IEEE80211_STYPE_PROBE_REQ);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	if (dst) {
 		memcpy(mgmt->da, dst, ETH_ALEN);
 		memcpy(mgmt->bssid, dst, ETH_ALEN);
@@ -1152,13 +1146,12 @@ static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst,
 		*pos = rate->bitrate / 5;
 	}
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
 
-static int ieee80211_sta_wep_configured(struct net_device *dev)
+static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (!sdata || !sdata->default_key ||
 	    sdata->default_key->conf.alg != ALG_WEP)
 		return 0;
@@ -1166,16 +1159,16 @@ static int ieee80211_sta_wep_configured(struct net_device *dev)
 }
 
 
-static void ieee80211_auth_completed(struct net_device *dev,
+static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
-	printk(KERN_DEBUG "%s: authenticated\n", dev->name);
+	printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name);
 	ifsta->flags |= IEEE80211_STA_AUTHENTICATED;
-	ieee80211_associate(dev, ifsta);
+	ieee80211_associate(sdata, ifsta);
 }
 
 
-static void ieee80211_auth_challenge(struct net_device *dev,
+static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta,
 				     struct ieee80211_mgmt *mgmt,
 				     size_t len)
@@ -1187,17 +1180,16 @@ static void ieee80211_auth_challenge(struct net_device *dev,
 	ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems);
 	if (!elems.challenge)
 		return;
-	ieee80211_send_auth(dev, ifsta, 3, elems.challenge - 2,
+	ieee80211_send_auth(sdata, ifsta, 3, elems.challenge - 2,
 			    elems.challenge_len + 2, 1);
 }
 
-static void ieee80211_send_addba_resp(struct net_device *dev, u8 *da, u16 tid,
+static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
 					u8 dialog_token, u16 status, u16 policy,
 					u16 buf_size, u16 timeout)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 	u16 capab;
@@ -1206,7 +1198,7 @@ static void ieee80211_send_addba_resp(struct net_device *dev, u8 *da, u16 tid,
 
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer "
-		       "for addba resp frame\n", dev->name);
+		       "for addba resp frame\n", sdata->dev->name);
 		return;
 	}
 
@@ -1214,9 +1206,9 @@ static void ieee80211_send_addba_resp(struct net_device *dev, u8 *da, u16 tid,
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN);
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
@@ -1235,17 +1227,16 @@ static void ieee80211_send_addba_resp(struct net_device *dev, u8 *da, u16 tid,
 	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
 	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 
 	return;
 }
 
-void ieee80211_send_addba_request(struct net_device *dev, const u8 *da,
+void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da,
 				u16 tid, u8 dialog_token, u16 start_seq_num,
 				u16 agg_size, u16 timeout)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
@@ -1255,16 +1246,16 @@ void ieee80211_send_addba_request(struct net_device *dev, const u8 *da,
 
 	if (!skb) {
 		printk(KERN_ERR "%s: failed to allocate buffer "
-				"for addba request frame\n", dev->name);
+				"for addba request frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN);
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 
@@ -1287,14 +1278,13 @@ void ieee80211_send_addba_request(struct net_device *dev, const u8 *da,
 	mgmt->u.action.u.addba_req.start_seq_num =
 					cpu_to_le16(start_seq_num << 4);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-static void ieee80211_sta_process_addba_request(struct net_device *dev,
+static void ieee80211_sta_process_addba_request(struct ieee80211_local *local,
 						struct ieee80211_mgmt *mgmt,
 						size_t len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_hw *hw = &local->hw;
 	struct ieee80211_conf *conf = &hw->conf;
 	struct sta_info *sta;
@@ -1426,16 +1416,15 @@ end:
 	spin_unlock_bh(&sta->lock);
 
 end_no_lock:
-	ieee80211_send_addba_resp(sta->sdata->dev, sta->addr, tid,
+	ieee80211_send_addba_resp(sta->sdata, sta->addr, tid,
 				  dialog_token, status, 1, buf_size, timeout);
 	rcu_read_unlock();
 }
 
-static void ieee80211_sta_process_addba_resp(struct net_device *dev,
+static void ieee80211_sta_process_addba_resp(struct ieee80211_local *local,
 					     struct ieee80211_mgmt *mgmt,
 					     size_t len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_hw *hw = &local->hw;
 	struct sta_info *sta;
 	u16 capab;
@@ -1497,11 +1486,10 @@ addba_resp_exit:
 	rcu_read_unlock();
 }
 
-void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid,
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid,
 			  u16 initiator, u16 reason_code)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
@@ -1511,7 +1499,7 @@ void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid,
 
 	if (!skb) {
 		printk(KERN_ERR "%s: failed to allocate buffer "
-					"for delba frame\n", dev->name);
+					"for delba frame\n", sdata->dev->name);
 		return;
 	}
 
@@ -1519,9 +1507,9 @@ void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid,
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN);
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
@@ -1537,12 +1525,12 @@ void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid,
 	mgmt->u.action.u.delba.params = cpu_to_le16(params);
 	mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-void ieee80211_send_bar(struct net_device *dev, u8 *ra, u16 tid, u16 ssn)
+void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_bar *bar;
 	u16 bar_control = 0;
@@ -1550,7 +1538,7 @@ void ieee80211_send_bar(struct net_device *dev, u8 *ra, u16 tid, u16 ssn)
 	skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom);
 	if (!skb) {
 		printk(KERN_ERR "%s: failed to allocate buffer for "
-			"bar frame\n", dev->name);
+			"bar frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -1559,20 +1547,20 @@ void ieee80211_send_bar(struct net_device *dev, u8 *ra, u16 tid, u16 ssn)
 	bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
 					 IEEE80211_STYPE_BACK_REQ);
 	memcpy(bar->ra, ra, ETH_ALEN);
-	memcpy(bar->ta, dev->dev_addr, ETH_ALEN);
+	memcpy(bar->ta, sdata->dev->dev_addr, ETH_ALEN);
 	bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL;
 	bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA;
 	bar_control |= (u16)(tid << 12);
 	bar->control = cpu_to_le16(bar_control);
 	bar->start_seq_num = cpu_to_le16(ssn);
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-void ieee80211_sta_stop_rx_ba_session(struct net_device *dev, u8 *ra, u16 tid,
+void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid,
 					u16 initiator, u16 reason)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_hw *hw = &local->hw;
 	struct sta_info *sta;
 	int ret, i;
@@ -1620,7 +1608,7 @@ void ieee80211_sta_stop_rx_ba_session(struct net_device *dev, u8 *ra, u16 tid,
 
 	/* check if this is a self generated aggregation halt */
 	if (initiator == WLAN_BACK_RECIPIENT || initiator == WLAN_BACK_TIMER)
-		ieee80211_send_delba(dev, ra, tid, 0, reason);
+		ieee80211_send_delba(sdata, ra, tid, 0, reason);
 
 	/* free the reordering buffer */
 	for (i = 0; i < sta->ampdu_mlme.tid_rx[tid]->buf_size; i++) {
@@ -1641,10 +1629,10 @@ void ieee80211_sta_stop_rx_ba_session(struct net_device *dev, u8 *ra, u16 tid,
 }
 
 
-static void ieee80211_sta_process_delba(struct net_device *dev,
+static void ieee80211_sta_process_delba(struct ieee80211_sub_if_data *sdata,
 			struct ieee80211_mgmt *mgmt, size_t len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 	u16 tid, params;
 	u16 initiator;
@@ -1671,7 +1659,7 @@ static void ieee80211_sta_process_delba(struct net_device *dev,
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
 	if (initiator == WLAN_BACK_INITIATOR)
-		ieee80211_sta_stop_rx_ba_session(dev, sta->addr, tid,
+		ieee80211_sta_stop_rx_ba_session(sdata, sta->addr, tid,
 						 WLAN_BACK_INITIATOR, 0);
 	else { /* WLAN_BACK_RECIPIENT */
 		spin_lock_bh(&sta->lock);
@@ -1758,31 +1746,31 @@ static void sta_rx_agg_session_timer_expired(unsigned long data)
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
 #endif
-	ieee80211_sta_stop_rx_ba_session(sta->sdata->dev, sta->addr,
+	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
 					 (u16)*ptid, WLAN_BACK_TIMER,
 					 WLAN_REASON_QSTA_TIMEOUT);
 }
 
-void ieee80211_sta_tear_down_BA_sessions(struct net_device *dev, u8 *addr)
+void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	int i;
 
 	for (i = 0; i <  STA_TID_NUM; i++) {
 		ieee80211_stop_tx_ba_session(&local->hw, addr, i,
 					     WLAN_BACK_INITIATOR);
-		ieee80211_sta_stop_rx_ba_session(dev, addr, i,
+		ieee80211_sta_stop_rx_ba_session(sdata, addr, i,
 						 WLAN_BACK_RECIPIENT,
 						 WLAN_REASON_QSTA_LEAVE_QBSS);
 	}
 }
 
-static void ieee80211_send_refuse_measurement_request(struct net_device *dev,
+static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
 					struct ieee80211_msrment_ie *request_ie,
 					const u8 *da, const u8 *bssid,
 					u8 dialog_token)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *msr_report;
 
@@ -1791,7 +1779,7 @@ static void ieee80211_send_refuse_measurement_request(struct net_device *dev,
 
 	if (!skb) {
 		printk(KERN_ERR "%s: failed to allocate buffer for "
-				"measurement report frame\n", dev->name);
+				"measurement report frame\n", sdata->dev->name);
 		return;
 	}
 
@@ -1799,7 +1787,7 @@ static void ieee80211_send_refuse_measurement_request(struct net_device *dev,
 	msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
 	memset(msr_report, 0, 24);
 	memcpy(msr_report->da, da, ETH_ALEN);
-	memcpy(msr_report->sa, dev->dev_addr, ETH_ALEN);
+	memcpy(msr_report->sa, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(msr_report->bssid, bssid, ETH_ALEN);
 	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 						IEEE80211_STYPE_ACTION);
@@ -1821,10 +1809,10 @@ static void ieee80211_send_refuse_measurement_request(struct net_device *dev,
 			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
 	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
 
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-static void ieee80211_sta_process_measurement_req(struct net_device *dev,
+static void ieee80211_sta_process_measurement_req(struct ieee80211_sub_if_data *sdata,
 						struct ieee80211_mgmt *mgmt,
 						size_t len)
 {
@@ -1835,19 +1823,18 @@ static void ieee80211_sta_process_measurement_req(struct net_device *dev,
 	 * For now just refuse
 	 * TODO: Answer basic measurement as unmeasured
 	 */
-	ieee80211_send_refuse_measurement_request(dev,
+	ieee80211_send_refuse_measurement_request(sdata,
 			&mgmt->u.action.u.measurement.msr_elem,
 			mgmt->sa, mgmt->bssid,
 			mgmt->u.action.u.measurement.dialog_token);
 }
 
 
-static void ieee80211_rx_mgmt_auth(struct net_device *dev,
+static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta,
 				   struct ieee80211_mgmt *mgmt,
 				   size_t len)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	u16 auth_alg, auth_transaction, status_code;
 	DECLARE_MAC_BUF(mac);
 
@@ -1879,7 +1866,7 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev,
 		 */
 		if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1)
 			return;
-		ieee80211_send_auth(dev, ifsta, 2, NULL, 0, 0);
+		ieee80211_send_auth(sdata, ifsta, 2, NULL, 0, 0);
 	}
 
 	if (auth_alg != ifsta->auth_alg ||
@@ -1912,7 +1899,7 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev,
 				    algs[pos] == 0xff)
 					continue;
 				if (algs[pos] == WLAN_AUTH_SHARED_KEY &&
-				    !ieee80211_sta_wep_configured(dev))
+				    !ieee80211_sta_wep_configured(sdata))
 					continue;
 				ifsta->auth_alg = algs[pos];
 				break;
@@ -1924,19 +1911,19 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev,
 	switch (ifsta->auth_alg) {
 	case WLAN_AUTH_OPEN:
 	case WLAN_AUTH_LEAP:
-		ieee80211_auth_completed(dev, ifsta);
+		ieee80211_auth_completed(sdata, ifsta);
 		break;
 	case WLAN_AUTH_SHARED_KEY:
 		if (ifsta->auth_transaction == 4)
-			ieee80211_auth_completed(dev, ifsta);
+			ieee80211_auth_completed(sdata, ifsta);
 		else
-			ieee80211_auth_challenge(dev, ifsta, mgmt, len);
+			ieee80211_auth_challenge(sdata, ifsta, mgmt, len);
 		break;
 	}
 }
 
 
-static void ieee80211_rx_mgmt_deauth(struct net_device *dev,
+static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta,
 				     struct ieee80211_mgmt *mgmt,
 				     size_t len)
@@ -1953,7 +1940,7 @@ static void ieee80211_rx_mgmt_deauth(struct net_device *dev,
 	reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
 
 	if (ifsta->flags & IEEE80211_STA_AUTHENTICATED)
-		printk(KERN_DEBUG "%s: deauthenticated\n", dev->name);
+		printk(KERN_DEBUG "%s: deauthenticated\n", sdata->dev->name);
 
 	if (ifsta->state == IEEE80211_AUTHENTICATE ||
 	    ifsta->state == IEEE80211_ASSOCIATE ||
@@ -1963,12 +1950,12 @@ static void ieee80211_rx_mgmt_deauth(struct net_device *dev,
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
 
-	ieee80211_set_disassoc(dev, ifsta, 1);
+	ieee80211_set_disassoc(sdata, ifsta, 1);
 	ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED;
 }
 
 
-static void ieee80211_rx_mgmt_disassoc(struct net_device *dev,
+static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
 				       struct ieee80211_if_sta *ifsta,
 				       struct ieee80211_mgmt *mgmt,
 				       size_t len)
@@ -1985,7 +1972,7 @@ static void ieee80211_rx_mgmt_disassoc(struct net_device *dev,
 	reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
 
 	if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
-		printk(KERN_DEBUG "%s: disassociated\n", dev->name);
+		printk(KERN_DEBUG "%s: disassociated\n", sdata->dev->name);
 
 	if (ifsta->state == IEEE80211_ASSOCIATED) {
 		ifsta->state = IEEE80211_ASSOCIATE;
@@ -1993,7 +1980,7 @@ static void ieee80211_rx_mgmt_disassoc(struct net_device *dev,
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
 
-	ieee80211_set_disassoc(dev, ifsta, 0);
+	ieee80211_set_disassoc(sdata, ifsta, 0);
 }
 
 
@@ -2004,7 +1991,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 					 int reassoc)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct net_device *dev = sdata->dev;
 	struct ieee80211_supported_band *sband;
 	struct sta_info *sta;
 	u64 rates, basic_rates;
@@ -2034,12 +2020,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 
 	printk(KERN_DEBUG "%s: RX %sssocResp from %s (capab=0x%x "
 	       "status=%d aid=%d)\n",
-	       dev->name, reassoc ? "Rea" : "A", print_mac(mac, mgmt->sa),
+	       sdata->dev->name, reassoc ? "Rea" : "A", print_mac(mac, mgmt->sa),
 	       capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14))));
 
 	if (status_code != WLAN_STATUS_SUCCESS) {
 		printk(KERN_DEBUG "%s: AP denied association (code=%d)\n",
-		       dev->name, status_code);
+		       sdata->dev->name, status_code);
 		/* if this was a reassociation, ensure we try a "full"
 		 * association next time. This works around some broken APs
 		 * which do not correctly reject reassociation requests. */
@@ -2049,7 +2035,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 
 	if ((aid & (BIT(15) | BIT(14))) != (BIT(15) | BIT(14)))
 		printk(KERN_DEBUG "%s: invalid aid value %d; bits 15:14 not "
-		       "set\n", dev->name, aid);
+		       "set\n", sdata->dev->name, aid);
 	aid &= ~(BIT(15) | BIT(14));
 
 	pos = mgmt->u.assoc_resp.variable;
@@ -2057,11 +2043,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 
 	if (!elems.supp_rates) {
 		printk(KERN_DEBUG "%s: no SuppRates element in AssocResp\n",
-		       dev->name);
+		       sdata->dev->name);
 		return;
 	}
 
-	printk(KERN_DEBUG "%s: associated\n", dev->name);
+	printk(KERN_DEBUG "%s: associated\n", sdata->dev->name);
 	ifsta->aid = aid;
 	ifsta->ap_capab = capab_info;
 
@@ -2082,11 +2068,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC);
 		if (!sta) {
 			printk(KERN_DEBUG "%s: failed to alloc STA entry for"
-			       " the AP\n", dev->name);
+			       " the AP\n", sdata->dev->name);
 			rcu_read_unlock();
 			return;
 		}
-		bss = ieee80211_rx_bss_get(dev, ifsta->bssid,
+		bss = ieee80211_rx_bss_get(local, ifsta->bssid,
 					   local->hw.conf.channel->center_freq,
 					   ifsta->ssid, ifsta->ssid_len);
 		if (bss) {
@@ -2099,7 +2085,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		err = sta_info_insert(sta);
 		if (err) {
 			printk(KERN_DEBUG "%s: failed to insert STA entry for"
-			       " the AP (error %d)\n", dev->name, err);
+			       " the AP (error %d)\n", sdata->dev->name, err);
 			rcu_read_unlock();
 			return;
 		}
@@ -2179,7 +2165,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	if (elems.wmm_param) {
 		set_sta_flags(sta, WLAN_STA_WME);
 		rcu_read_unlock();
-		ieee80211_sta_wmm_params(dev, ifsta, elems.wmm_param,
+		ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
 					 elems.wmm_param_len);
 	} else
 		rcu_read_unlock();
@@ -2188,17 +2174,16 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	 * ieee80211_set_associated() will tell the driver */
 	bss_conf->aid = aid;
 	bss_conf->assoc_capability = capab_info;
-	ieee80211_set_associated(dev, ifsta, 1);
+	ieee80211_set_associated(sdata, ifsta, 1);
 
-	ieee80211_associated(dev, ifsta);
+	ieee80211_associated(sdata, ifsta);
 }
 
 
 /* Caller must hold local->sta_bss_lock */
-static void __ieee80211_rx_bss_hash_add(struct net_device *dev,
+static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
 					struct ieee80211_sta_bss *bss)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	u8 hash_idx;
 
 	if (bss_mesh_cfg(bss))
@@ -2234,10 +2219,10 @@ static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
 
 
 static struct ieee80211_sta_bss *
-ieee80211_rx_bss_add(struct net_device *dev, u8 *bssid, int freq,
+ieee80211_rx_bss_add(struct ieee80211_sub_if_data *sdata, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sta_bss *bss;
 
 	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
@@ -2255,16 +2240,15 @@ ieee80211_rx_bss_add(struct net_device *dev, u8 *bssid, int freq,
 	spin_lock_bh(&local->sta_bss_lock);
 	/* TODO: order by RSSI? */
 	list_add_tail(&bss->list, &local->sta_bss_list);
-	__ieee80211_rx_bss_hash_add(dev, bss);
+	__ieee80211_rx_bss_hash_add(local, bss);
 	spin_unlock_bh(&local->sta_bss_lock);
 	return bss;
 }
 
 static struct ieee80211_sta_bss *
-ieee80211_rx_bss_get(struct net_device *dev, u8 *bssid, int freq,
+ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_sta_bss *bss;
 
 	spin_lock_bh(&local->sta_bss_lock);
@@ -2286,10 +2270,9 @@ ieee80211_rx_bss_get(struct net_device *dev, u8 *bssid, int freq,
 
 #ifdef CONFIG_MAC80211_MESH
 static struct ieee80211_sta_bss *
-ieee80211_rx_mesh_bss_get(struct net_device *dev, u8 *mesh_id, int mesh_id_len,
+ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
 			  u8 *mesh_cfg, int freq)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_sta_bss *bss;
 
 	spin_lock_bh(&local->sta_bss_lock);
@@ -2311,10 +2294,9 @@ ieee80211_rx_mesh_bss_get(struct net_device *dev, u8 *mesh_id, int mesh_id_len,
 }
 
 static struct ieee80211_sta_bss *
-ieee80211_rx_mesh_bss_add(struct net_device *dev, u8 *mesh_id, int mesh_id_len,
+ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
 			  u8 *mesh_cfg, int mesh_config_len, int freq)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_sta_bss *bss;
 
 	if (mesh_config_len != MESH_CFG_LEN)
@@ -2348,7 +2330,7 @@ ieee80211_rx_mesh_bss_add(struct net_device *dev, u8 *mesh_id, int mesh_id_len,
 	spin_lock_bh(&local->sta_bss_lock);
 	/* TODO: order by RSSI? */
 	list_add_tail(&bss->list, &local->sta_bss_list);
-	__ieee80211_rx_bss_hash_add(dev, bss);
+	__ieee80211_rx_bss_hash_add(local, bss);
 	spin_unlock_bh(&local->sta_bss_lock);
 	return bss;
 }
@@ -2399,23 +2381,20 @@ void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local)
 }
 
 
-static int ieee80211_sta_join_ibss(struct net_device *dev,
+static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta,
 				   struct ieee80211_sta_bss *bss)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	int res, rates, i, j;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos;
-	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_supported_band *sband;
 	union iwreq_data wrqu;
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
 	/* Remove possible STA entries from other IBSS networks. */
 	sta_info_flush_delayed(sdata);
 
@@ -2433,7 +2412,7 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 	sdata->drop_unencrypted = bss->capability &
 		WLAN_CAPABILITY_PRIVACY ? 1 : 0;
 
-	res = ieee80211_set_freq(dev, bss->freq);
+	res = ieee80211_set_freq(sdata, bss->freq);
 
 	if (res)
 		return res;
@@ -2449,7 +2428,7 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 						  IEEE80211_STYPE_PROBE_RESP);
 		memset(mgmt->da, 0xff, ETH_ALEN);
-		memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN);
+		memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
 		mgmt->u.beacon.beacon_int =
 			cpu_to_le16(local->hw.conf.beacon_int);
@@ -2506,14 +2485,14 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 	}
 	ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates;
 
-	ieee80211_sta_def_wmm_params(dev, bss, 1);
+	ieee80211_sta_def_wmm_params(sdata, bss, 1);
 
 	ifsta->state = IEEE80211_IBSS_JOINED;
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
 
 	memset(&wrqu, 0, sizeof(wrqu));
 	memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN);
-	wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+	wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
 
 	return res;
 }
@@ -2555,35 +2534,34 @@ u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 }
 
 
-static void ieee80211_rx_bss_info(struct net_device *dev,
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_mgmt *mgmt,
 				  size_t len,
 				  struct ieee80211_rx_status *rx_status,
 				  struct ieee802_11_elems *elems,
 				  int beacon)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	int freq, clen;
 	struct ieee80211_sta_bss *bss;
 	struct sta_info *sta;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	u64 beacon_timestamp, rx_timestamp;
 	struct ieee80211_channel *channel;
 	DECLARE_MAC_BUF(mac);
 	DECLARE_MAC_BUF(mac2);
 
-	if (!beacon && memcmp(mgmt->da, dev->dev_addr, ETH_ALEN))
+	if (!beacon && memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
 		return; /* ignore ProbeResp to foreign address */
 
 	beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
-	    elems->mesh_config && mesh_matches_local(elems, dev)) {
+	    elems->mesh_config && mesh_matches_local(elems, sdata)) {
 		u64 rates = ieee80211_sta_get_rates(local, elems,
 						rx_status->band);
 
-		mesh_neighbour_update(mgmt->sa, rates, dev,
-				      mesh_peer_accepts_plinks(elems, dev));
+		mesh_neighbour_update(mgmt->sa, rates, sdata,
+				      mesh_peer_accepts_plinks(elems));
 	}
 
 	rcu_read_lock();
@@ -2620,21 +2598,21 @@ static void ieee80211_rx_bss_info(struct net_device *dev,
 
 #ifdef CONFIG_MAC80211_MESH
 	if (elems->mesh_config)
-		bss = ieee80211_rx_mesh_bss_get(dev, elems->mesh_id,
+		bss = ieee80211_rx_mesh_bss_get(local, elems->mesh_id,
 				elems->mesh_id_len, elems->mesh_config, freq);
 	else
 #endif
-		bss = ieee80211_rx_bss_get(dev, mgmt->bssid, freq,
+		bss = ieee80211_rx_bss_get(local, mgmt->bssid, freq,
 					   elems->ssid, elems->ssid_len);
 	if (!bss) {
 #ifdef CONFIG_MAC80211_MESH
 		if (elems->mesh_config)
-			bss = ieee80211_rx_mesh_bss_add(dev, elems->mesh_id,
+			bss = ieee80211_rx_mesh_bss_add(local, elems->mesh_id,
 				elems->mesh_id_len, elems->mesh_config,
 				elems->mesh_config_len, freq);
 		else
 #endif
-			bss = ieee80211_rx_bss_add(dev, mgmt->bssid, freq,
+			bss = ieee80211_rx_bss_add(sdata, mgmt->bssid, freq,
 						  elems->ssid, elems->ssid_len);
 		if (!bss)
 			return;
@@ -2871,10 +2849,10 @@ static void ieee80211_rx_bss_info(struct net_device *dev,
 #ifndef CONFIG_MAC80211_IBSS_DEBUG
 			printk(KERN_DEBUG "%s: beacon TSF higher than "
 			       "local TSF - IBSS merge with BSSID %s\n",
-			       dev->name, print_mac(mac, mgmt->bssid));
+			       sdata->dev->name, print_mac(mac, mgmt->bssid));
 #endif
-			ieee80211_sta_join_ibss(dev, &sdata->u.sta, bss);
-			ieee80211_ibss_add_sta(dev, NULL,
+			ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss);
+			ieee80211_ibss_add_sta(sdata, NULL,
 					       mgmt->bssid, mgmt->sa,
 					       BIT(rx_status->rate_idx));
 		}
@@ -2884,7 +2862,7 @@ static void ieee80211_rx_bss_info(struct net_device *dev,
 }
 
 
-static void ieee80211_rx_mgmt_probe_resp(struct net_device *dev,
+static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_mgmt *mgmt,
 					 size_t len,
 					 struct ieee80211_rx_status *rx_status)
@@ -2899,20 +2877,19 @@ static void ieee80211_rx_mgmt_probe_resp(struct net_device *dev,
 	ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
 				&elems);
 
-	ieee80211_rx_bss_info(dev, mgmt, len, rx_status, &elems, 0);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, 0);
 }
 
 
-static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_mgmt *mgmt,
 				     size_t len,
 				     struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_if_sta *ifsta;
 	size_t baselen;
 	struct ieee802_11_elems elems;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_conf *conf = &local->hw.conf;
 	u32 changed = 0;
 
@@ -2923,9 +2900,8 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
 
 	ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems);
 
-	ieee80211_rx_bss_info(dev, mgmt, len, rx_status, &elems, 1);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, 1);
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 		return;
 	ifsta = &sdata->u.sta;
@@ -2934,7 +2910,7 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
 	    memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
 		return;
 
-	ieee80211_sta_wmm_params(dev, ifsta, elems.wmm_param,
+	ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
 				 elems.wmm_param_len);
 
 	/* Do not send changes to driver if we are scanning. This removes
@@ -2966,14 +2942,13 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
 }
 
 
-static void ieee80211_rx_mgmt_probe_req(struct net_device *dev,
+static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 					struct ieee80211_if_sta *ifsta,
 					struct ieee80211_mgmt *mgmt,
 					size_t len,
 					struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	int tx_last_beacon;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *resp;
@@ -2997,7 +2972,7 @@ static void ieee80211_rx_mgmt_probe_req(struct net_device *dev,
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 	printk(KERN_DEBUG "%s: RX ProbeReq SA=%s DA=%s BSSID="
 	       "%s (tx_last_beacon=%d)\n",
-	       dev->name, print_mac(mac, mgmt->sa), print_mac(mac2, mgmt->da),
+	       sdata->dev->name, print_mac(mac, mgmt->sa), print_mac(mac2, mgmt->da),
 	       print_mac(mac3, mgmt->bssid), tx_last_beacon);
 #endif /* CONFIG_MAC80211_IBSS_DEBUG */
 
@@ -3015,7 +2990,7 @@ static void ieee80211_rx_mgmt_probe_req(struct net_device *dev,
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 		printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
 		       "from %s\n",
-		       dev->name, print_mac(mac, mgmt->sa));
+		       sdata->dev->name, print_mac(mac, mgmt->sa));
 #endif
 		return;
 	}
@@ -3035,19 +3010,18 @@ static void ieee80211_rx_mgmt_probe_req(struct net_device *dev,
 	memcpy(resp->da, mgmt->sa, ETH_ALEN);
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 	printk(KERN_DEBUG "%s: Sending ProbeResp to %s\n",
-	       dev->name, print_mac(mac, resp->da));
+	       sdata->dev->name, print_mac(mac, resp->da));
 #endif /* CONFIG_MAC80211_IBSS_DEBUG */
-	ieee80211_sta_tx(dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-static void ieee80211_rx_mgmt_action(struct net_device *dev,
+static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta,
 				     struct ieee80211_mgmt *mgmt,
 				     size_t len,
 				     struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 
 	if (len < IEEE80211_MIN_ACTION_SIZE)
 		return;
@@ -3061,7 +3035,7 @@ static void ieee80211_rx_mgmt_action(struct net_device *dev,
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.measurement)))
 				break;
-			ieee80211_sta_process_measurement_req(dev, mgmt, len);
+			ieee80211_sta_process_measurement_req(sdata, mgmt, len);
 			break;
 		}
 		break;
@@ -3071,38 +3045,37 @@ static void ieee80211_rx_mgmt_action(struct net_device *dev,
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.addba_req)))
 				break;
-			ieee80211_sta_process_addba_request(dev, mgmt, len);
+			ieee80211_sta_process_addba_request(local, mgmt, len);
 			break;
 		case WLAN_ACTION_ADDBA_RESP:
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.addba_resp)))
 				break;
-			ieee80211_sta_process_addba_resp(dev, mgmt, len);
+			ieee80211_sta_process_addba_resp(local, mgmt, len);
 			break;
 		case WLAN_ACTION_DELBA:
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.delba)))
 				break;
-			ieee80211_sta_process_delba(dev, mgmt, len);
+			ieee80211_sta_process_delba(sdata, mgmt, len);
 			break;
 		}
 		break;
 	case PLINK_CATEGORY:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
-			mesh_rx_plink_frame(dev, mgmt, len, rx_status);
+			mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
 		break;
 	case MESH_PATH_SEL_CATEGORY:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
-			mesh_rx_path_sel_frame(dev, mgmt, len);
+			mesh_rx_path_sel_frame(sdata, mgmt, len);
 		break;
 	}
 }
 
-void ieee80211_sta_rx_mgmt(struct net_device *dev, struct sk_buff *skb,
+void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 			   struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta;
 	struct ieee80211_mgmt *mgmt;
 	u16 fc;
@@ -3110,7 +3083,6 @@ void ieee80211_sta_rx_mgmt(struct net_device *dev, struct sk_buff *skb,
 	if (skb->len < 24)
 		goto fail;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	ifsta = &sdata->u.sta;
 
 	mgmt = (struct ieee80211_mgmt *) skb->data;
@@ -3137,16 +3109,14 @@ void ieee80211_sta_rx_mgmt(struct net_device *dev, struct sk_buff *skb,
 }
 
 
-static void ieee80211_sta_rx_queued_mgmt(struct net_device *dev,
+static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 					 struct sk_buff *skb)
 {
 	struct ieee80211_rx_status *rx_status;
-	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_if_sta *ifsta;
 	struct ieee80211_mgmt *mgmt;
 	u16 fc;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	ifsta = &sdata->u.sta;
 
 	rx_status = (struct ieee80211_rx_status *) skb->cb;
@@ -3155,17 +3125,17 @@ static void ieee80211_sta_rx_queued_mgmt(struct net_device *dev,
 
 	switch (fc & IEEE80211_FCTL_STYPE) {
 	case IEEE80211_STYPE_PROBE_REQ:
-		ieee80211_rx_mgmt_probe_req(dev, ifsta, mgmt, skb->len,
+		ieee80211_rx_mgmt_probe_req(sdata, ifsta, mgmt, skb->len,
 					    rx_status);
 		break;
 	case IEEE80211_STYPE_PROBE_RESP:
-		ieee80211_rx_mgmt_probe_resp(dev, mgmt, skb->len, rx_status);
+		ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, rx_status);
 		break;
 	case IEEE80211_STYPE_BEACON:
-		ieee80211_rx_mgmt_beacon(dev, mgmt, skb->len, rx_status);
+		ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status);
 		break;
 	case IEEE80211_STYPE_AUTH:
-		ieee80211_rx_mgmt_auth(dev, ifsta, mgmt, skb->len);
+		ieee80211_rx_mgmt_auth(sdata, ifsta, mgmt, skb->len);
 		break;
 	case IEEE80211_STYPE_ASSOC_RESP:
 		ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, skb->len, 0);
@@ -3174,13 +3144,13 @@ static void ieee80211_sta_rx_queued_mgmt(struct net_device *dev,
 		ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, skb->len, 1);
 		break;
 	case IEEE80211_STYPE_DEAUTH:
-		ieee80211_rx_mgmt_deauth(dev, ifsta, mgmt, skb->len);
+		ieee80211_rx_mgmt_deauth(sdata, ifsta, mgmt, skb->len);
 		break;
 	case IEEE80211_STYPE_DISASSOC:
-		ieee80211_rx_mgmt_disassoc(dev, ifsta, mgmt, skb->len);
+		ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt, skb->len);
 		break;
 	case IEEE80211_STYPE_ACTION:
-		ieee80211_rx_mgmt_action(dev, ifsta, mgmt, skb->len, rx_status);
+		ieee80211_rx_mgmt_action(sdata, ifsta, mgmt, skb->len, rx_status);
 		break;
 	}
 
@@ -3189,7 +3159,7 @@ static void ieee80211_sta_rx_queued_mgmt(struct net_device *dev,
 
 
 ieee80211_rx_result
-ieee80211_sta_rx_scan(struct net_device *dev, struct sk_buff *skb,
+ieee80211_sta_rx_scan(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      struct ieee80211_rx_status *rx_status)
 {
 	struct ieee80211_mgmt *mgmt;
@@ -3208,13 +3178,13 @@ ieee80211_sta_rx_scan(struct net_device *dev, struct sk_buff *skb,
 		return RX_DROP_MONITOR;
 
 	if (ieee80211_is_probe_resp(fc)) {
-		ieee80211_rx_mgmt_probe_resp(dev, mgmt, skb->len, rx_status);
+		ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, rx_status);
 		dev_kfree_skb(skb);
 		return RX_QUEUED;
 	}
 
 	if (ieee80211_is_beacon(fc)) {
-		ieee80211_rx_mgmt_beacon(dev, mgmt, skb->len, rx_status);
+		ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status);
 		dev_kfree_skb(skb);
 		return RX_QUEUED;
 	}
@@ -3223,12 +3193,11 @@ ieee80211_sta_rx_scan(struct net_device *dev, struct sk_buff *skb,
 }
 
 
-static int ieee80211_sta_active_ibss(struct net_device *dev)
+static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	int active = 0;
 	struct sta_info *sta;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	rcu_read_lock();
 
@@ -3247,9 +3216,9 @@ static int ieee80211_sta_active_ibss(struct net_device *dev)
 }
 
 
-static void ieee80211_sta_expire(struct net_device *dev, unsigned long exp_time)
+static void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta, *tmp;
 	LIST_HEAD(tmp_list);
 	DECLARE_MAC_BUF(mac);
@@ -3260,7 +3229,7 @@ static void ieee80211_sta_expire(struct net_device *dev, unsigned long exp_time)
 		if (time_after(jiffies, sta->last_rx + exp_time)) {
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 			printk(KERN_DEBUG "%s: expiring inactive STA %s\n",
-			       dev->name, print_mac(mac, sta->addr));
+			       sdata->dev->name, print_mac(mac, sta->addr));
 #endif
 			__sta_info_unlink(&sta);
 			if (sta)
@@ -3273,30 +3242,29 @@ static void ieee80211_sta_expire(struct net_device *dev, unsigned long exp_time)
 }
 
 
-static void ieee80211_sta_merge_ibss(struct net_device *dev,
+static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
 
-	ieee80211_sta_expire(dev, IEEE80211_IBSS_INACTIVITY_LIMIT);
-	if (ieee80211_sta_active_ibss(dev))
+	ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
+	if (ieee80211_sta_active_ibss(sdata))
 		return;
 
 	printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
-	       "IBSS networks with same SSID (merge)\n", dev->name);
-	ieee80211_sta_req_scan(dev, ifsta->ssid, ifsta->ssid_len);
+	       "IBSS networks with same SSID (merge)\n", sdata->dev->name);
+	ieee80211_sta_req_scan(sdata, ifsta->ssid, ifsta->ssid_len);
 }
 
 
 #ifdef CONFIG_MAC80211_MESH
-static void ieee80211_mesh_housekeeping(struct net_device *dev,
+static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata,
 			   struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	bool free_plinks;
 
-	ieee80211_sta_expire(dev, IEEE80211_MESH_PEER_INACTIVITY_LIMIT);
-	mesh_path_expire(dev);
+	ieee80211_sta_expire(sdata, IEEE80211_MESH_PEER_INACTIVITY_LIMIT);
+	mesh_path_expire(sdata);
 
 	free_plinks = mesh_plink_availables(sdata);
 	if (free_plinks != sdata->u.sta.accepting_plinks)
@@ -3307,10 +3275,9 @@ static void ieee80211_mesh_housekeeping(struct net_device *dev,
 }
 
 
-void ieee80211_start_mesh(struct net_device *dev)
+void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_sta *ifsta;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	ifsta = &sdata->u.sta;
 	ifsta->state = IEEE80211_MESH_UP;
 	ieee80211_sta_timer((unsigned long)sdata);
@@ -3324,7 +3291,7 @@ void ieee80211_sta_timer(unsigned long data)
 	struct ieee80211_sub_if_data *sdata =
 		(struct ieee80211_sub_if_data *) data;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = wdev_priv(&sdata->wdev);
+	struct ieee80211_local *local = sdata->local;
 
 	set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
 	queue_work(local->hw.workqueue, &ifsta->work);
@@ -3334,12 +3301,11 @@ void ieee80211_sta_work(struct work_struct *work)
 {
 	struct ieee80211_sub_if_data *sdata =
 		container_of(work, struct ieee80211_sub_if_data, u.sta.work);
-	struct net_device *dev = sdata->dev;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta;
 	struct sk_buff *skb;
 
-	if (!netif_running(dev))
+	if (!netif_running(sdata->dev))
 		return;
 
 	if (local->sta_sw_scanning || local->sta_hw_scanning)
@@ -3352,27 +3318,27 @@ void ieee80211_sta_work(struct work_struct *work)
 	ifsta = &sdata->u.sta;
 
 	while ((skb = skb_dequeue(&ifsta->skb_queue)))
-		ieee80211_sta_rx_queued_mgmt(dev, skb);
+		ieee80211_sta_rx_queued_mgmt(sdata, skb);
 
 #ifdef CONFIG_MAC80211_MESH
 	if (ifsta->preq_queue_len &&
 	    time_after(jiffies,
 		       ifsta->last_preq + msecs_to_jiffies(ifsta->mshcfg.dot11MeshHWMPpreqMinInterval)))
-		mesh_path_start_discovery(dev);
+		mesh_path_start_discovery(sdata);
 #endif
 
 	if (ifsta->state != IEEE80211_AUTHENTICATE &&
 	    ifsta->state != IEEE80211_ASSOCIATE &&
 	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
 		if (ifsta->scan_ssid_len)
-			ieee80211_sta_start_scan(dev, ifsta->scan_ssid, ifsta->scan_ssid_len);
+			ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
 		else
-			ieee80211_sta_start_scan(dev, NULL, 0);
+			ieee80211_sta_start_scan(sdata, NULL, 0);
 		return;
 	}
 
 	if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
-		if (ieee80211_sta_config_auth(dev, ifsta))
+		if (ieee80211_sta_config_auth(sdata, ifsta))
 			return;
 		clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
 	} else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
@@ -3382,23 +3348,23 @@ void ieee80211_sta_work(struct work_struct *work)
 	case IEEE80211_DISABLED:
 		break;
 	case IEEE80211_AUTHENTICATE:
-		ieee80211_authenticate(dev, ifsta);
+		ieee80211_authenticate(sdata, ifsta);
 		break;
 	case IEEE80211_ASSOCIATE:
-		ieee80211_associate(dev, ifsta);
+		ieee80211_associate(sdata, ifsta);
 		break;
 	case IEEE80211_ASSOCIATED:
-		ieee80211_associated(dev, ifsta);
+		ieee80211_associated(sdata, ifsta);
 		break;
 	case IEEE80211_IBSS_SEARCH:
-		ieee80211_sta_find_ibss(dev, ifsta);
+		ieee80211_sta_find_ibss(sdata, ifsta);
 		break;
 	case IEEE80211_IBSS_JOINED:
-		ieee80211_sta_merge_ibss(dev, ifsta);
+		ieee80211_sta_merge_ibss(sdata, ifsta);
 		break;
 #ifdef CONFIG_MAC80211_MESH
 	case IEEE80211_MESH_UP:
-		ieee80211_mesh_housekeeping(dev, ifsta);
+		ieee80211_mesh_housekeeping(sdata, ifsta);
 		break;
 #endif
 	default:
@@ -3406,20 +3372,20 @@ void ieee80211_sta_work(struct work_struct *work)
 		break;
 	}
 
-	if (ieee80211_privacy_mismatch(dev, ifsta)) {
+	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
 		printk(KERN_DEBUG "%s: privacy configuration mismatch and "
-		       "mixed-cell disabled - disassociate\n", dev->name);
+		       "mixed-cell disabled - disassociate\n", sdata->dev->name);
 
-		ieee80211_send_disassoc(dev, ifsta, WLAN_REASON_UNSPECIFIED);
-		ieee80211_set_disassoc(dev, ifsta, 0);
+		ieee80211_send_disassoc(sdata, ifsta, WLAN_REASON_UNSPECIFIED);
+		ieee80211_set_disassoc(sdata, ifsta, 0);
 	}
 }
 
 
-static void ieee80211_sta_reset_auth(struct net_device *dev,
+static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 
 	if (local->ops->reset_tsf) {
 		/* Reset own TSF to allow time synchronization work. */
@@ -3440,15 +3406,14 @@ static void ieee80211_sta_reset_auth(struct net_device *dev,
 	ifsta->auth_transaction = -1;
 	ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
 	ifsta->auth_tries = ifsta->assoc_tries = 0;
-	netif_carrier_off(dev);
+	netif_carrier_off(sdata->dev);
 }
 
 
-void ieee80211_sta_req_auth(struct net_device *dev,
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
 			    struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 		return;
@@ -3492,11 +3457,10 @@ static int ieee80211_sta_match_ssid(struct ieee80211_if_sta *ifsta,
 	return 0;
 }
 
-static int ieee80211_sta_config_auth(struct net_device *dev,
+static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sta_bss *bss, *selected = NULL;
 	int top_rssi = 0, freq;
 
@@ -3535,22 +3499,22 @@ static int ieee80211_sta_config_auth(struct net_device *dev,
 	spin_unlock_bh(&local->sta_bss_lock);
 
 	if (selected) {
-		ieee80211_set_freq(dev, selected->freq);
+		ieee80211_set_freq(sdata, selected->freq);
 		if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
-			ieee80211_sta_set_ssid(dev, selected->ssid,
+			ieee80211_sta_set_ssid(sdata, selected->ssid,
 					       selected->ssid_len);
-		ieee80211_sta_set_bssid(dev, selected->bssid);
-		ieee80211_sta_def_wmm_params(dev, selected, 0);
+		ieee80211_sta_set_bssid(sdata, selected->bssid);
+		ieee80211_sta_def_wmm_params(sdata, selected, 0);
 		ieee80211_rx_bss_put(local, selected);
 		ifsta->state = IEEE80211_AUTHENTICATE;
-		ieee80211_sta_reset_auth(dev, ifsta);
+		ieee80211_sta_reset_auth(sdata, ifsta);
 		return 0;
 	} else {
 		if (ifsta->state != IEEE80211_AUTHENTICATE) {
 			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
-				ieee80211_sta_start_scan(dev, NULL, 0);
+				ieee80211_sta_start_scan(sdata, NULL, 0);
 			else
-				ieee80211_sta_start_scan(dev, ifsta->ssid,
+				ieee80211_sta_start_scan(sdata, ifsta->ssid,
 							 ifsta->ssid_len);
 			ifsta->state = IEEE80211_AUTHENTICATE;
 			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
@@ -3561,12 +3525,11 @@ static int ieee80211_sta_config_auth(struct net_device *dev,
 }
 
 
-static int ieee80211_sta_create_ibss(struct net_device *dev,
+static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sta_bss *bss;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_supported_band *sband;
 	u8 bssid[ETH_ALEN], *pos;
 	int i;
@@ -3582,15 +3545,15 @@ static int ieee80211_sta_create_ibss(struct net_device *dev,
 	 * random number generator get different BSSID. */
 	get_random_bytes(bssid, ETH_ALEN);
 	for (i = 0; i < ETH_ALEN; i++)
-		bssid[i] ^= dev->dev_addr[i];
+		bssid[i] ^= sdata->dev->dev_addr[i];
 	bssid[0] &= ~0x01;
 	bssid[0] |= 0x02;
 #endif
 
 	printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %s\n",
-	       dev->name, print_mac(mac, bssid));
+	       sdata->dev->name, print_mac(mac, bssid));
 
-	bss = ieee80211_rx_bss_add(dev, bssid,
+	bss = ieee80211_rx_bss_add(sdata, bssid,
 				   local->hw.conf.channel->center_freq,
 				   sdata->u.sta.ssid, sdata->u.sta.ssid_len);
 	if (!bss)
@@ -3617,16 +3580,16 @@ static int ieee80211_sta_create_ibss(struct net_device *dev,
 		*pos++ = (u8) (rate / 5);
 	}
 
-	ret = ieee80211_sta_join_ibss(dev, ifsta, bss);
+	ret = ieee80211_sta_join_ibss(sdata, ifsta, bss);
 	ieee80211_rx_bss_put(local, bss);
 	return ret;
 }
 
 
-static int ieee80211_sta_find_ibss(struct net_device *dev,
+static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sta_bss *bss;
 	int found = 0;
 	u8 bssid[ETH_ALEN];
@@ -3637,10 +3600,10 @@ static int ieee80211_sta_find_ibss(struct net_device *dev,
 	if (ifsta->ssid_len == 0)
 		return -EINVAL;
 
-	active_ibss = ieee80211_sta_active_ibss(dev);
+	active_ibss = ieee80211_sta_active_ibss(sdata);
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 	printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
-	       dev->name, active_ibss);
+	       sdata->dev->name, active_ibss);
 #endif /* CONFIG_MAC80211_IBSS_DEBUG */
 	spin_lock_bh(&local->sta_bss_lock);
 	list_for_each_entry(bss, &local->sta_bss_list, list) {
@@ -3675,15 +3638,15 @@ static int ieee80211_sta_find_ibss(struct net_device *dev,
 		else
 			search_freq = local->hw.conf.channel->center_freq;
 
-		bss = ieee80211_rx_bss_get(dev, bssid, search_freq,
+		bss = ieee80211_rx_bss_get(local, bssid, search_freq,
 					   ifsta->ssid, ifsta->ssid_len);
 		if (!bss)
 			goto dont_join;
 
 		printk(KERN_DEBUG "%s: Selected IBSS BSSID %s"
 		       " based on configured SSID\n",
-		       dev->name, print_mac(mac, bssid));
-		ret = ieee80211_sta_join_ibss(dev, ifsta, bss);
+		       sdata->dev->name, print_mac(mac, bssid));
+		ret = ieee80211_sta_join_ibss(sdata, ifsta, bss);
 		ieee80211_rx_bss_put(local, bss);
 		return ret;
 	}
@@ -3695,14 +3658,14 @@ dont_join:
 
 	/* Selected IBSS not found in current scan results - try to scan */
 	if (ifsta->state == IEEE80211_IBSS_JOINED &&
-	    !ieee80211_sta_active_ibss(dev)) {
+	    !ieee80211_sta_active_ibss(sdata)) {
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_IBSS_MERGE_INTERVAL);
 	} else if (time_after(jiffies, local->last_scan_completed +
 			      IEEE80211_SCAN_INTERVAL)) {
 		printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
-		       "join\n", dev->name);
-		return ieee80211_sta_req_scan(dev, ifsta->ssid,
+		       "join\n", sdata->dev->name);
+		return ieee80211_sta_req_scan(sdata, ifsta->ssid,
 					      ifsta->ssid_len);
 	} else if (ifsta->state != IEEE80211_IBSS_JOINED) {
 		int interval = IEEE80211_SCAN_INTERVAL;
@@ -3712,10 +3675,10 @@ dont_join:
 			if ((ifsta->flags & IEEE80211_STA_CREATE_IBSS) &&
 			    (!(local->oper_channel->flags &
 					IEEE80211_CHAN_NO_IBSS)))
-				return ieee80211_sta_create_ibss(dev, ifsta);
+				return ieee80211_sta_create_ibss(sdata, ifsta);
 			if (ifsta->flags & IEEE80211_STA_CREATE_IBSS) {
 				printk(KERN_DEBUG "%s: IBSS not allowed on"
-				       " %d MHz\n", dev->name,
+				       " %d MHz\n", sdata->dev->name,
 				       local->hw.conf.channel->center_freq);
 			}
 
@@ -3733,9 +3696,8 @@ dont_join:
 }
 
 
-int ieee80211_sta_set_ssid(struct net_device *dev, char *ssid, size_t len)
+int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta;
 	int res;
 
@@ -3759,7 +3721,7 @@ int ieee80211_sta_set_ssid(struct net_device *dev, char *ssid, size_t len)
 			res = ieee80211_if_config(sdata, IEEE80211_IFCC_SSID);
 		if (res) {
 			printk(KERN_DEBUG "%s: Failed to config new SSID to "
-			       "the low-level driver\n", dev->name);
+			       "the low-level driver\n", sdata->dev->name);
 			return res;
 		}
 	}
@@ -3773,16 +3735,15 @@ int ieee80211_sta_set_ssid(struct net_device *dev, char *ssid, size_t len)
 	    !(ifsta->flags & IEEE80211_STA_BSSID_SET)) {
 		ifsta->ibss_join_req = jiffies;
 		ifsta->state = IEEE80211_IBSS_SEARCH;
-		return ieee80211_sta_find_ibss(dev, ifsta);
+		return ieee80211_sta_find_ibss(sdata, ifsta);
 	}
 
 	return 0;
 }
 
 
-int ieee80211_sta_get_ssid(struct net_device *dev, char *ssid, size_t *len)
+int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	memcpy(ssid, ifsta->ssid, ifsta->ssid_len);
 	*len = ifsta->ssid_len;
@@ -3790,13 +3751,11 @@ int ieee80211_sta_get_ssid(struct net_device *dev, char *ssid, size_t *len)
 }
 
 
-int ieee80211_sta_set_bssid(struct net_device *dev, u8 *bssid)
+int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
 {
-	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_if_sta *ifsta;
 	int res;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	ifsta = &sdata->u.sta;
 
 	if (memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0) {
@@ -3809,7 +3768,7 @@ int ieee80211_sta_set_bssid(struct net_device *dev, u8 *bssid)
 			res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
 		if (res) {
 			printk(KERN_DEBUG "%s: Failed to config new BSSID to "
-			       "the low-level driver\n", dev->name);
+			       "the low-level driver\n", sdata->dev->name);
 			return res;
 		}
 	}
@@ -3850,7 +3809,7 @@ static void ieee80211_send_nullfunc(struct ieee80211_local *local,
 	memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
 
-	ieee80211_sta_tx(sdata->dev, skb, 0);
+	ieee80211_sta_tx(sdata, skb, 0);
 }
 
 
@@ -3923,8 +3882,8 @@ done:
 		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
 		    (!(ifsta->state == IEEE80211_IBSS_JOINED) &&
-		    !ieee80211_sta_active_ibss(dev)))
-			ieee80211_sta_find_ibss(dev, ifsta);
+		    !ieee80211_sta_active_ibss(sdata)))
+			ieee80211_sta_find_ibss(sdata, ifsta);
 	}
 }
 EXPORT_SYMBOL(ieee80211_scan_completed);
@@ -4013,7 +3972,7 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 
 		if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN)
 			break;
-		ieee80211_send_probe_req(dev, NULL, local->scan_ssid,
+		ieee80211_send_probe_req(sdata, NULL, local->scan_ssid,
 					 local->scan_ssid_len);
 		next_delay = IEEE80211_CHANNEL_TIME;
 		break;
@@ -4025,10 +3984,10 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 }
 
 
-static int ieee80211_sta_start_scan(struct net_device *dev,
+static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 				    u8 *ssid, size_t ssid_len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = scan_sdata->local;
 	struct ieee80211_sub_if_data *sdata;
 
 	if (ssid_len > IEEE80211_MAX_SSID_LEN)
@@ -4052,7 +4011,7 @@ static int ieee80211_sta_start_scan(struct net_device *dev,
 	 */
 
 	if (local->sta_sw_scanning || local->sta_hw_scanning) {
-		if (local->scan_dev == dev)
+		if (local->scan_dev == scan_sdata->dev)
 			return 0;
 		return -EBUSY;
 	}
@@ -4062,7 +4021,7 @@ static int ieee80211_sta_start_scan(struct net_device *dev,
 					     ssid, ssid_len);
 		if (!rc) {
 			local->sta_hw_scanning = 1;
-			local->scan_dev = dev;
+			local->scan_dev = scan_sdata->dev;
 		}
 		return rc;
 	}
@@ -4086,7 +4045,7 @@ static int ieee80211_sta_start_scan(struct net_device *dev,
 	local->scan_state = SCAN_SET_CHANNEL;
 	local->scan_channel_idx = 0;
 	local->scan_band = IEEE80211_BAND_2GHZ;
-	local->scan_dev = dev;
+	local->scan_dev = scan_sdata->dev;
 
 	netif_addr_lock_bh(local->mdev);
 	local->filter_flags |= FIF_BCN_PRBRESP_PROMISC;
@@ -4105,17 +4064,16 @@ static int ieee80211_sta_start_scan(struct net_device *dev,
 }
 
 
-int ieee80211_sta_req_scan(struct net_device *dev, u8 *ssid, size_t ssid_len)
+int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
-		return ieee80211_sta_start_scan(dev, ssid, ssid_len);
+		return ieee80211_sta_start_scan(sdata, ssid, ssid_len);
 
 	if (local->sta_sw_scanning || local->sta_hw_scanning) {
-		if (local->scan_dev == dev)
+		if (local->scan_dev == sdata->dev)
 			return 0;
 		return -EBUSY;
 	}
@@ -4129,12 +4087,11 @@ int ieee80211_sta_req_scan(struct net_device *dev, u8 *ssid, size_t ssid_len)
 }
 
 static char *
-ieee80211_sta_scan_result(struct net_device *dev,
+ieee80211_sta_scan_result(struct ieee80211_local *local,
 			  struct iw_request_info *info,
 			  struct ieee80211_sta_bss *bss,
 			  char *current_ev, char *end_buf)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct iw_event iwe;
 
 	if (time_after(jiffies,
@@ -4321,11 +4278,10 @@ ieee80211_sta_scan_result(struct net_device *dev,
 }
 
 
-int ieee80211_sta_scan_results(struct net_device *dev,
+int ieee80211_sta_scan_results(struct ieee80211_local *local,
 			       struct iw_request_info *info,
 			       char *buf, size_t len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	char *current_ev = buf;
 	char *end_buf = buf + len;
 	struct ieee80211_sta_bss *bss;
@@ -4336,7 +4292,7 @@ int ieee80211_sta_scan_results(struct net_device *dev,
 			spin_unlock_bh(&local->sta_bss_lock);
 			return -E2BIG;
 		}
-		current_ev = ieee80211_sta_scan_result(dev, info, bss,
+		current_ev = ieee80211_sta_scan_result(local, info, bss,
 						       current_ev, end_buf);
 	}
 	spin_unlock_bh(&local->sta_bss_lock);
@@ -4344,9 +4300,8 @@ int ieee80211_sta_scan_results(struct net_device *dev,
 }
 
 
-int ieee80211_sta_set_extra_ie(struct net_device *dev, char *ie, size_t len)
+int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 
 	kfree(ifsta->extra_ie);
@@ -4366,13 +4321,12 @@ int ieee80211_sta_set_extra_ie(struct net_device *dev, char *ie, size_t len)
 }
 
 
-struct sta_info *ieee80211_ibss_add_sta(struct net_device *dev,
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 					struct sk_buff *skb, u8 *bssid,
 					u8 *addr, u64 supp_rates)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	DECLARE_MAC_BUF(mac);
 	int band = local->hw.conf.channel->band;
 
@@ -4381,7 +4335,7 @@ struct sta_info *ieee80211_ibss_add_sta(struct net_device *dev,
 	if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
 		if (net_ratelimit()) {
 			printk(KERN_DEBUG "%s: No room for a new IBSS STA "
-			       "entry %s\n", dev->name, print_mac(mac, addr));
+			       "entry %s\n", sdata->dev->name, print_mac(mac, addr));
 		}
 		return NULL;
 	}
@@ -4391,7 +4345,7 @@ struct sta_info *ieee80211_ibss_add_sta(struct net_device *dev,
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 	printk(KERN_DEBUG "%s: Adding new IBSS station %s (dev=%s)\n",
-	       wiphy_name(local->hw.wiphy), print_mac(mac, addr), dev->name);
+	       wiphy_name(local->hw.wiphy), print_mac(mac, addr), sdata->dev->name);
 #endif
 
 	sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
@@ -4414,31 +4368,29 @@ struct sta_info *ieee80211_ibss_add_sta(struct net_device *dev,
 }
 
 
-int ieee80211_sta_deauthenticate(struct net_device *dev, u16 reason)
+int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 
 	printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n",
-	       dev->name, reason);
+	       sdata->dev->name, reason);
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
 	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
 		return -EINVAL;
 
-	ieee80211_send_deauth(dev, ifsta, reason);
-	ieee80211_set_disassoc(dev, ifsta, 1);
+	ieee80211_send_deauth(sdata, ifsta, reason);
+	ieee80211_set_disassoc(sdata, ifsta, 1);
 	return 0;
 }
 
 
-int ieee80211_sta_disassociate(struct net_device *dev, u16 reason)
+int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 
 	printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n",
-	       dev->name, reason);
+	       sdata->dev->name, reason);
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 		return -EINVAL;
@@ -4446,8 +4398,8 @@ int ieee80211_sta_disassociate(struct net_device *dev, u16 reason)
 	if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED))
 		return -1;
 
-	ieee80211_send_disassoc(dev, ifsta, reason);
-	ieee80211_set_disassoc(dev, ifsta, 0);
+	ieee80211_send_disassoc(sdata, ifsta, reason);
+	ieee80211_set_disassoc(sdata, ifsta, 0);
 	return 0;
 }
 
@@ -4464,7 +4416,7 @@ void ieee80211_notify_mac(struct ieee80211_hw *hw,
 			if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 				continue;
 
-			ieee80211_sta_req_auth(sdata->dev, &sdata->u.sta);
+			ieee80211_sta_req_auth(sdata, &sdata->u.sta);
 		}
 		rcu_read_unlock();
 		break;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 2464263cb7a..fd83ef760a3 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -404,11 +404,11 @@ ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx)
 	struct sk_buff *skb = rx->skb;
 
 	if (unlikely(local->sta_hw_scanning))
-		return ieee80211_sta_rx_scan(rx->dev, skb, rx->status);
+		return ieee80211_sta_rx_scan(rx->sdata, skb, rx->status);
 
 	if (unlikely(local->sta_sw_scanning)) {
 		/* drop all the other packets during a software scan anyway */
-		if (ieee80211_sta_rx_scan(rx->dev, skb, rx->status)
+		if (ieee80211_sta_rx_scan(rx->sdata, skb, rx->status)
 		    != RX_QUEUED)
 			dev_kfree_skb(skb);
 		return RX_QUEUED;
@@ -466,7 +466,7 @@ ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
 
 	if (ieee80211_is_data(hdr->frame_control) &&
 	    is_multicast_ether_addr(hdr->addr1) &&
-	    mesh_rmc_check(hdr->addr4, msh_h_get(hdr, hdrlen), rx->dev))
+	    mesh_rmc_check(hdr->addr4, msh_h_get(hdr, hdrlen), rx->sdata))
 		return RX_DROP_MONITOR;
 #undef msh_h_get
 
@@ -1523,7 +1523,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 	     sdata->vif.type == IEEE80211_IF_TYPE_IBSS ||
 	     sdata->vif.type == IEEE80211_IF_TYPE_MESH_POINT) &&
 	    !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME))
-		ieee80211_sta_rx_mgmt(rx->dev, rx->skb, rx->status);
+		ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
 	else
 		return RX_DROP_MONITOR;
 
@@ -1570,7 +1570,7 @@ static void ieee80211_rx_michael_mic_report(struct net_device *dev,
 	    !ieee80211_is_auth(hdr->frame_control))
 		goto ignore;
 
-	mac80211_ev_michael_mic_failure(rx->dev, keyidx, hdr);
+	mac80211_ev_michael_mic_failure(rx->sdata, keyidx, hdr);
  ignore:
 	dev_kfree_skb(rx->skb);
 	rx->skb = NULL;
@@ -1744,7 +1744,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 			return 0;
 		if (ieee80211_is_beacon(hdr->frame_control)) {
 			if (!rx->sta)
-				rx->sta = ieee80211_ibss_add_sta(sdata->dev,
+				rx->sta = ieee80211_ibss_add_sta(sdata,
 						rx->skb, bssid, hdr->addr2,
 						BIT(rx->status->rate_idx));
 			return 1;
@@ -1760,7 +1760,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 				return 0;
 			rx->flags &= ~IEEE80211_RX_RA_MATCH;
 		} else if (!rx->sta)
-			rx->sta = ieee80211_ibss_add_sta(sdata->dev, rx->skb,
+			rx->sta = ieee80211_ibss_add_sta(sdata, rx->skb,
 						bssid, hdr->addr2,
 						BIT(rx->status->rate_idx));
 		break;
@@ -2066,7 +2066,7 @@ static u8 ieee80211_rx_reorder_ampdu(struct ieee80211_local *local,
 	/* if this mpdu is fragmented - terminate rx aggregation session */
 	sc = le16_to_cpu(hdr->seq_ctrl);
 	if (sc & IEEE80211_SCTL_FRAG) {
-		ieee80211_sta_stop_rx_ba_session(sta->sdata->dev, sta->addr,
+		ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
 			tid, 0, WLAN_REASON_QSTA_REQUIRE_SETUP);
 		ret = 1;
 		goto end_reorder;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 85f3ba85c13..c413d4836af 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1327,7 +1327,7 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 			if (is_multicast_ether_addr(hdr->addr3))
 				memcpy(hdr->addr1, hdr->addr3, ETH_ALEN);
 			else
-				if (mesh_nexthop_lookup(skb, odev))
+				if (mesh_nexthop_lookup(skb, osdata))
 					return  0;
 			if (memcmp(odev->dev_addr, hdr->addr4, ETH_ALEN) != 0)
 				IEEE80211_IFSTA_MESH_CTR_INC(&osdata->u.sta,
@@ -1908,7 +1908,7 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 		*pos++ = WLAN_EID_SSID;
 		*pos++ = 0x0;
 
-		mesh_mgmt_ies_add(skb, sdata->dev);
+		mesh_mgmt_ies_add(skb, sdata);
 
 		num_beacons = &sdata->u.sta.num_beacons;
 	} else {
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 34fa8ed1e78..735daa355c3 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -27,22 +27,19 @@
 #include "aes_ccm.h"
 
 
-static int ieee80211_set_encryption(struct net_device *dev, u8 *sta_addr,
+static int ieee80211_set_encryption(struct ieee80211_sub_if_data *sdata, u8 *sta_addr,
 				    int idx, int alg, int remove,
 				    int set_tx_key, const u8 *_key,
 				    size_t key_len)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 	struct ieee80211_key *key;
-	struct ieee80211_sub_if_data *sdata;
 	int err;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
 	if (idx < 0 || idx >= NUM_DEFAULT_KEYS) {
 		printk(KERN_DEBUG "%s: set_encrypt - invalid idx=%d\n",
-		       dev->name, idx);
+		       sdata->dev->name, idx);
 		return -EINVAL;
 	}
 
@@ -127,11 +124,11 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev,
 
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
 	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		int ret = ieee80211_sta_set_extra_ie(dev, extra, data->length);
+		int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length);
 		if (ret)
 			return ret;
 		sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
-		ieee80211_sta_req_auth(dev, &sdata->u.sta);
+		ieee80211_sta_req_auth(sdata, &sdata->u.sta);
 		return 0;
 	}
 
@@ -333,12 +330,11 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev,
 	return 0;
 }
 
-int ieee80211_set_freq(struct net_device *dev, int freqMHz)
+int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
 {
 	int ret = -EINVAL;
 	struct ieee80211_channel *chan;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 
 	chan = ieee80211_get_channel(local->hw.wiphy, freqMHz);
 
@@ -346,7 +342,7 @@ int ieee80211_set_freq(struct net_device *dev, int freqMHz)
 		if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
 		    chan->flags & IEEE80211_CHAN_NO_IBSS) {
 			printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
-				"%d MHz\n", dev->name, chan->center_freq);
+				"%d MHz\n", sdata->dev->name, chan->center_freq);
 			return ret;
 		}
 		local->oper_channel = chan;
@@ -379,14 +375,14 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev,
 					IEEE80211_STA_AUTO_CHANNEL_SEL;
 			return 0;
 		} else
-			return ieee80211_set_freq(dev,
+			return ieee80211_set_freq(sdata,
 				ieee80211_channel_to_frequency(freq->m));
 	} else {
 		int i, div = 1000000;
 		for (i = 0; i < freq->e; i++)
 			div /= 10;
 		if (div > 0)
-			return ieee80211_set_freq(dev, freq->m / div);
+			return ieee80211_set_freq(sdata, freq->m / div);
 		else
 			return -EINVAL;
 	}
@@ -432,10 +428,10 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev,
 			sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_SSID_SEL;
 		else
 			sdata->u.sta.flags |= IEEE80211_STA_AUTO_SSID_SEL;
-		ret = ieee80211_sta_set_ssid(dev, ssid, len);
+		ret = ieee80211_sta_set_ssid(sdata, ssid, len);
 		if (ret)
 			return ret;
-		ieee80211_sta_req_auth(dev, &sdata->u.sta);
+		ieee80211_sta_req_auth(sdata, &sdata->u.sta);
 		return 0;
 	}
 
@@ -460,7 +456,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
 	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		int res = ieee80211_sta_get_ssid(dev, ssid, &len);
+		int res = ieee80211_sta_get_ssid(sdata, ssid, &len);
 		if (res == 0) {
 			data->length = len;
 			data->flags = 1;
@@ -504,10 +500,10 @@ static int ieee80211_ioctl_siwap(struct net_device *dev,
 			sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL;
 		else
 			sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
-		ret = ieee80211_sta_set_bssid(dev, (u8 *) &ap_addr->sa_data);
+		ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
 		if (ret)
 			return ret;
-		ieee80211_sta_req_auth(dev, &sdata->u.sta);
+		ieee80211_sta_req_auth(sdata, &sdata->u.sta);
 		return 0;
 	} else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) {
 		/*
@@ -584,7 +580,7 @@ static int ieee80211_ioctl_siwscan(struct net_device *dev,
 		ssid_len = req->essid_len;
 	}
 
-	return ieee80211_sta_req_scan(dev, ssid, ssid_len);
+	return ieee80211_sta_req_scan(sdata, ssid, ssid_len);
 }
 
 
@@ -594,11 +590,14 @@ static int ieee80211_ioctl_giwscan(struct net_device *dev,
 {
 	int res;
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_sub_if_data *sdata;
+
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (local->sta_sw_scanning || local->sta_hw_scanning)
 		return -EAGAIN;
 
-	res = ieee80211_sta_scan_results(dev, info, extra, data->length);
+	res = ieee80211_sta_scan_results(local, info, extra, data->length);
 	if (res >= 0) {
 		data->length = res;
 		return 0;
@@ -894,10 +893,10 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev,
 	switch (mlme->cmd) {
 	case IW_MLME_DEAUTH:
 		/* TODO: mlme->addr.sa_data */
-		return ieee80211_sta_deauthenticate(dev, mlme->reason_code);
+		return ieee80211_sta_deauthenticate(sdata, mlme->reason_code);
 	case IW_MLME_DISASSOC:
 		/* TODO: mlme->addr.sa_data */
-		return ieee80211_sta_disassociate(dev, mlme->reason_code);
+		return ieee80211_sta_disassociate(sdata, mlme->reason_code);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -938,7 +937,7 @@ static int ieee80211_ioctl_siwencode(struct net_device *dev,
 	}
 
 	return ieee80211_set_encryption(
-		dev, bcaddr,
+		sdata, bcaddr,
 		idx, alg, remove,
 		!sdata->default_key,
 		keybuf, erq->length);
@@ -1184,7 +1183,7 @@ static int ieee80211_ioctl_siwencodeext(struct net_device *dev,
 	} else
 		idx--;
 
-	return ieee80211_set_encryption(dev, ext->addr.sa_data, idx, alg,
+	return ieee80211_set_encryption(sdata, ext->addr.sa_data, idx, alg,
 					remove,
 					ext->ext_flags &
 					IW_ENCODE_EXT_SET_TX_KEY,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 2f33df0dccc..78021780b88 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -127,7 +127,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
 		if (!(rx->flags & IEEE80211_RX_RA_MATCH))
 			return RX_DROP_UNUSABLE;
 
-		mac80211_ev_michael_mic_failure(rx->dev, rx->key->conf.keyidx,
+		mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx,
 						(void *) skb->data);
 		return RX_DROP_UNUSABLE;
 	}
-- 
cgit v1.2.3


From 8e7cdbb6333ef7654e708bd60e50a123688dcd7b Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Sun, 3 Aug 2008 14:32:01 +0300
Subject: mac80211: filter probes in ieee80211_rx_mgmt_probe_resp

This patch moves filtering statement from ieee80211_rx_bss_info
which is called for both beacon and probe to ieee80211_rx_mgmt_probe_resp
and save few cycles in beacon parsing.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index fb8e1e742ca..ae5a5cbabae 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2550,9 +2550,6 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	DECLARE_MAC_BUF(mac);
 	DECLARE_MAC_BUF(mac2);
 
-	if (!beacon && memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
-		return; /* ignore ProbeResp to foreign address */
-
 	beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
@@ -2870,6 +2867,9 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
 	size_t baselen;
 	struct ieee802_11_elems elems;
 
+	if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
+		return; /* ignore ProbeResp to foreign address */
+
 	baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
 	if (baselen > len)
 		return;
-- 
cgit v1.2.3


From 48c2fc59aa415ba92be0ad3a7e741c46883e3944 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Wed, 6 Aug 2008 14:22:01 +0300
Subject: mac80211: cleanup mlme state namespace

This patch move add STA_MLME to station mlme state defines.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h | 34 ++++++++++++++--------
 net/mac80211/main.c        |  2 +-
 net/mac80211/mlme.c        | 72 +++++++++++++++++++++++-----------------------
 net/mac80211/wext.c        |  4 +--
 4 files changed, 61 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b5d3f58a784..747814f319a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -292,17 +292,33 @@ struct mesh_config {
 #define IEEE80211_STA_AUTO_BSSID_SEL	BIT(11)
 #define IEEE80211_STA_AUTO_CHANNEL_SEL	BIT(12)
 #define IEEE80211_STA_PRIVACY_INVOKED	BIT(13)
+/* flags for  MLME request*/
+#define IEEE80211_STA_REQ_SCAN 0
+#define IEEE80211_STA_REQ_AUTH 1
+#define IEEE80211_STA_REQ_RUN  2
+
+/* flags used for setting mlme state */
+enum ieee80211_sta_mlme_state {
+	IEEE80211_STA_MLME_DISABLED,
+	IEEE80211_STA_MLME_AUTHENTICATE,
+	IEEE80211_STA_MLME_ASSOCIATE,
+	IEEE80211_STA_MLME_ASSOCIATED,
+	IEEE80211_STA_MLME_IBSS_SEARCH,
+	IEEE80211_STA_MLME_IBSS_JOINED,
+	IEEE80211_STA_MLME_MESH_UP
+};
+
+/* bitfield of allowed auth algs */
+#define IEEE80211_AUTH_ALG_OPEN BIT(0)
+#define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1)
+#define IEEE80211_AUTH_ALG_LEAP BIT(2)
+
 struct ieee80211_if_sta {
 	struct timer_list timer;
 	struct work_struct work;
 	u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
 	u8 ssid[IEEE80211_MAX_SSID_LEN];
-	enum {
-		IEEE80211_DISABLED, IEEE80211_AUTHENTICATE,
-		IEEE80211_ASSOCIATE, IEEE80211_ASSOCIATED,
-		IEEE80211_IBSS_SEARCH, IEEE80211_IBSS_JOINED,
-		IEEE80211_MESH_UP
-	} state;
+	enum ieee80211_sta_mlme_state state;
 	size_t ssid_len;
 	u8 scan_ssid[IEEE80211_MAX_SSID_LEN];
 	size_t scan_ssid_len;
@@ -352,13 +368,7 @@ struct ieee80211_if_sta {
 	unsigned long last_probe;
 
 	unsigned int flags;
-#define IEEE80211_STA_REQ_SCAN 0
-#define IEEE80211_STA_REQ_AUTH 1
-#define IEEE80211_STA_REQ_RUN  2
 
-#define IEEE80211_AUTH_ALG_OPEN BIT(0)
-#define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1)
-#define IEEE80211_AUTH_ALG_LEAP BIT(2)
 	unsigned int auth_algs; /* bitfield of allowed auth algs */
 	int auth_alg; /* currently used IEEE 802.11 authentication algorithm */
 	int auth_transaction;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 81963948665..398ca66bdfc 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -539,7 +539,7 @@ static int ieee80211_stop(struct net_device *dev)
 		/* fall through */
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
-		sdata->u.sta.state = IEEE80211_DISABLED;
+		sdata->u.sta.state = IEEE80211_STA_MLME_DISABLED;
 		memset(sdata->u.sta.bssid, 0, ETH_ALEN);
 		del_timer_sync(&sdata->u.sta.timer);
 		/*
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ae5a5cbabae..f05519d1c2c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -663,11 +663,11 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
 		printk(KERN_DEBUG "%s: authentication with AP %s"
 		       " timed out\n",
 		       sdata->dev->name, print_mac(mac, ifsta->bssid));
-		ifsta->state = IEEE80211_DISABLED;
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
 		return;
 	}
 
-	ifsta->state = IEEE80211_AUTHENTICATE;
+	ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 	printk(KERN_DEBUG "%s: authenticate with AP %s\n",
 	       sdata->dev->name, print_mac(mac, ifsta->bssid));
 
@@ -1004,17 +1004,17 @@ static void ieee80211_associate(struct ieee80211_sub_if_data *sdata,
 		printk(KERN_DEBUG "%s: association with AP %s"
 		       " timed out\n",
 		       sdata->dev->name, print_mac(mac, ifsta->bssid));
-		ifsta->state = IEEE80211_DISABLED;
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
 		return;
 	}
 
-	ifsta->state = IEEE80211_ASSOCIATE;
+	ifsta->state = IEEE80211_STA_MLME_ASSOCIATE;
 	printk(KERN_DEBUG "%s: associate with AP %s\n",
 	       sdata->dev->name, print_mac(mac, ifsta->bssid));
 	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
 		printk(KERN_DEBUG "%s: mismatch in privacy configuration and "
 		       "mixed-cell disabled - abort association\n", sdata->dev->name);
-		ifsta->state = IEEE80211_DISABLED;
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
 		return;
 	}
 
@@ -1037,7 +1037,7 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 	 * for better APs. */
 	/* TODO: remove expired BSSes */
 
-	ifsta->state = IEEE80211_ASSOCIATED;
+	ifsta->state = IEEE80211_STA_MLME_ASSOCIATED;
 
 	rcu_read_lock();
 
@@ -1080,7 +1080,7 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 		sta_info_destroy(sta);
 
 	if (disassoc) {
-		ifsta->state = IEEE80211_DISABLED;
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
 		ieee80211_set_associated(sdata, ifsta, 0);
 	} else {
 		mod_timer(&ifsta->timer, jiffies +
@@ -1838,7 +1838,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	u16 auth_alg, auth_transaction, status_code;
 	DECLARE_MAC_BUF(mac);
 
-	if (ifsta->state != IEEE80211_AUTHENTICATE &&
+	if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
 	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
 		return;
 
@@ -1942,10 +1942,10 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
 	if (ifsta->flags & IEEE80211_STA_AUTHENTICATED)
 		printk(KERN_DEBUG "%s: deauthenticated\n", sdata->dev->name);
 
-	if (ifsta->state == IEEE80211_AUTHENTICATE ||
-	    ifsta->state == IEEE80211_ASSOCIATE ||
-	    ifsta->state == IEEE80211_ASSOCIATED) {
-		ifsta->state = IEEE80211_AUTHENTICATE;
+	if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE ||
+	    ifsta->state == IEEE80211_STA_MLME_ASSOCIATE ||
+	    ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
+		ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
@@ -1974,8 +1974,8 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
 	if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
 		printk(KERN_DEBUG "%s: disassociated\n", sdata->dev->name);
 
-	if (ifsta->state == IEEE80211_ASSOCIATED) {
-		ifsta->state = IEEE80211_ASSOCIATE;
+	if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
+		ifsta->state = IEEE80211_STA_MLME_ASSOCIATE;
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
@@ -2005,7 +2005,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	/* AssocResp and ReassocResp have identical structure, so process both
 	 * of them in this function. */
 
-	if (ifsta->state != IEEE80211_ASSOCIATE)
+	if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATE)
 		return;
 
 	if (len < 24 + 6)
@@ -2487,7 +2487,7 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_sta_def_wmm_params(sdata, bss, 1);
 
-	ifsta->state = IEEE80211_IBSS_JOINED;
+	ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED;
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
 
 	memset(&wrqu, 0, sizeof(wrqu));
@@ -2960,7 +2960,7 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 #endif
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS ||
-	    ifsta->state != IEEE80211_IBSS_JOINED ||
+	    ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED ||
 	    len < 24 + 2 || !ifsta->probe_resp)
 		return;
 
@@ -3279,7 +3279,7 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_sta *ifsta;
 	ifsta = &sdata->u.sta;
-	ifsta->state = IEEE80211_MESH_UP;
+	ifsta->state = IEEE80211_STA_MLME_MESH_UP;
 	ieee80211_sta_timer((unsigned long)sdata);
 	ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON);
 }
@@ -3327,8 +3327,8 @@ void ieee80211_sta_work(struct work_struct *work)
 		mesh_path_start_discovery(sdata);
 #endif
 
-	if (ifsta->state != IEEE80211_AUTHENTICATE &&
-	    ifsta->state != IEEE80211_ASSOCIATE &&
+	if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
+	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
 	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
 		if (ifsta->scan_ssid_len)
 			ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
@@ -3345,25 +3345,25 @@ void ieee80211_sta_work(struct work_struct *work)
 		return;
 
 	switch (ifsta->state) {
-	case IEEE80211_DISABLED:
+	case IEEE80211_STA_MLME_DISABLED:
 		break;
-	case IEEE80211_AUTHENTICATE:
+	case IEEE80211_STA_MLME_AUTHENTICATE:
 		ieee80211_authenticate(sdata, ifsta);
 		break;
-	case IEEE80211_ASSOCIATE:
+	case IEEE80211_STA_MLME_ASSOCIATE:
 		ieee80211_associate(sdata, ifsta);
 		break;
-	case IEEE80211_ASSOCIATED:
+	case IEEE80211_STA_MLME_ASSOCIATED:
 		ieee80211_associated(sdata, ifsta);
 		break;
-	case IEEE80211_IBSS_SEARCH:
+	case IEEE80211_STA_MLME_IBSS_SEARCH:
 		ieee80211_sta_find_ibss(sdata, ifsta);
 		break;
-	case IEEE80211_IBSS_JOINED:
+	case IEEE80211_STA_MLME_IBSS_JOINED:
 		ieee80211_sta_merge_ibss(sdata, ifsta);
 		break;
 #ifdef CONFIG_MAC80211_MESH
-	case IEEE80211_MESH_UP:
+	case IEEE80211_STA_MLME_MESH_UP:
 		ieee80211_mesh_housekeeping(sdata, ifsta);
 		break;
 #endif
@@ -3506,20 +3506,20 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 		ieee80211_sta_set_bssid(sdata, selected->bssid);
 		ieee80211_sta_def_wmm_params(sdata, selected, 0);
 		ieee80211_rx_bss_put(local, selected);
-		ifsta->state = IEEE80211_AUTHENTICATE;
+		ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 		ieee80211_sta_reset_auth(sdata, ifsta);
 		return 0;
 	} else {
-		if (ifsta->state != IEEE80211_AUTHENTICATE) {
+		if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE) {
 			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
 				ieee80211_sta_start_scan(sdata, NULL, 0);
 			else
 				ieee80211_sta_start_scan(sdata, ifsta->ssid,
 							 ifsta->ssid_len);
-			ifsta->state = IEEE80211_AUTHENTICATE;
+			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
 		} else
-			ifsta->state = IEEE80211_DISABLED;
+			ifsta->state = IEEE80211_STA_MLME_DISABLED;
 	}
 	return -1;
 }
@@ -3657,7 +3657,7 @@ dont_join:
 #endif /* CONFIG_MAC80211_IBSS_DEBUG */
 
 	/* Selected IBSS not found in current scan results - try to scan */
-	if (ifsta->state == IEEE80211_IBSS_JOINED &&
+	if (ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED &&
 	    !ieee80211_sta_active_ibss(sdata)) {
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_IBSS_MERGE_INTERVAL);
@@ -3667,7 +3667,7 @@ dont_join:
 		       "join\n", sdata->dev->name);
 		return ieee80211_sta_req_scan(sdata, ifsta->ssid,
 					      ifsta->ssid_len);
-	} else if (ifsta->state != IEEE80211_IBSS_JOINED) {
+	} else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) {
 		int interval = IEEE80211_SCAN_INTERVAL;
 
 		if (time_after(jiffies, ifsta->ibss_join_req +
@@ -3687,7 +3687,7 @@ dont_join:
 			interval = IEEE80211_SCAN_INTERVAL_SLOW;
 		}
 
-		ifsta->state = IEEE80211_IBSS_SEARCH;
+		ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
 		mod_timer(&ifsta->timer, jiffies + interval);
 		return 0;
 	}
@@ -3734,7 +3734,7 @@ int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
 	    !(ifsta->flags & IEEE80211_STA_BSSID_SET)) {
 		ifsta->ibss_join_req = jiffies;
-		ifsta->state = IEEE80211_IBSS_SEARCH;
+		ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
 		return ieee80211_sta_find_ibss(sdata, ifsta);
 	}
 
@@ -3881,7 +3881,7 @@ done:
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
 		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
-		    (!(ifsta->state == IEEE80211_IBSS_JOINED) &&
+		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
 		    !ieee80211_sta_active_ibss(sdata)))
 			ieee80211_sta_find_ibss(sdata, ifsta);
 	}
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 735daa355c3..beae664ab48 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -535,8 +535,8 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
 	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		if (sdata->u.sta.state == IEEE80211_ASSOCIATED ||
-		    sdata->u.sta.state == IEEE80211_IBSS_JOINED) {
+		if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED ||
+		    sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) {
 			ap_addr->sa_family = ARPHRD_ETHER;
 			memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
 			return 0;
-- 
cgit v1.2.3


From 6042a3e3ff7943e4ff5cbcb8c223ea87337501ea Mon Sep 17 00:00:00 2001
From: Ron Rindjunsky <ron.rindjunsky@intel.com>
Date: Fri, 8 Aug 2008 01:50:46 +0300
Subject: mac80211: change number of pre-assoc scans

This patch fixes noticed problem in noisy environments of 50+ APs
that scan fails to find the requested AP on first try, which
leads to connection refusal. second scan has empirically proven to fix
this problem in almost all cases.

Signed-off-by: Ron Rindjunsky <ron.rindjunsky@intel.com>
Signed-off-by: Esti Kummer <ester.kummer@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h | 4 +++-
 net/mac80211/mlme.c        | 9 +++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 747814f319a..c6de3156930 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -361,7 +361,9 @@ struct ieee80211_if_sta {
 
 	struct sk_buff_head skb_queue;
 
-	int auth_tries, assoc_tries;
+	int assoc_scan_tries; /* number of scans done pre-association */
+	int auth_tries; /* retries for auth req */
+	int assoc_tries; /* retries for assoc req */
 
 	unsigned long request;
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f05519d1c2c..e821d1ac5b0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -34,6 +34,7 @@
 #include "led.h"
 #include "mesh.h"
 
+#define IEEE80211_ASSOC_SCANS_MAX_TRIES 2
 #define IEEE80211_AUTH_TIMEOUT (HZ / 5)
 #define IEEE80211_AUTH_MAX_TRIES 3
 #define IEEE80211_ASSOC_TIMEOUT (HZ / 5)
@@ -596,6 +597,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 {
 	if (deauth)
 		ifsta->auth_tries = 0;
+	ifsta->assoc_scan_tries = 0;
 	ifsta->assoc_tries = 0;
 	ieee80211_set_associated(sdata, ifsta, 0);
 }
@@ -3405,7 +3407,9 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
 		ifsta->auth_alg = WLAN_AUTH_OPEN;
 	ifsta->auth_transaction = -1;
 	ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
-	ifsta->auth_tries = ifsta->assoc_tries = 0;
+	ifsta->assoc_scan_tries = 0;
+	ifsta->auth_tries = 0;
+	ifsta->assoc_tries = 0;
 	netif_carrier_off(sdata->dev);
 }
 
@@ -3510,7 +3514,8 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 		ieee80211_sta_reset_auth(sdata, ifsta);
 		return 0;
 	} else {
-		if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE) {
+		if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
+			ifsta->assoc_scan_tries++;
 			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
 				ieee80211_sta_start_scan(sdata, NULL, 0);
 			else
-- 
cgit v1.2.3


From 9859b81eaeb8d48563b5fbd90215c0ae606455a3 Mon Sep 17 00:00:00 2001
From: Ron Rindjunsky <ron.rindjunsky@intel.com>
Date: Sat, 9 Aug 2008 03:02:19 +0300
Subject: mac80211: add direct probe before association

This patch adds a direct probe request as first step in the association
flow if data we have is not up to date. Motivation of this step is to make
sure that the bss information we have is correct, since last scan could
have been done a while ago, and beacons do not fully answer this need as
there are potential differences between them and probe responses (e.g.
WMM parameter element)

Signed-off-by: Ron Rindjunsky <ron.rindjunsky@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  9 ++++--
 net/mac80211/mlme.c        | 79 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c6de3156930..8361054fb7c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -100,7 +100,7 @@ struct ieee80211_sta_bss {
 	u64 timestamp;
 	int beacon_int;
 
-	bool probe_resp;
+	unsigned long last_probe_resp;
 	unsigned long last_update;
 
 	/* during assocation, we save an ERP value from a probe response so
@@ -294,12 +294,14 @@ struct mesh_config {
 #define IEEE80211_STA_PRIVACY_INVOKED	BIT(13)
 /* flags for  MLME request*/
 #define IEEE80211_STA_REQ_SCAN 0
-#define IEEE80211_STA_REQ_AUTH 1
-#define IEEE80211_STA_REQ_RUN  2
+#define IEEE80211_STA_REQ_DIRECT_PROBE 1
+#define IEEE80211_STA_REQ_AUTH 2
+#define IEEE80211_STA_REQ_RUN  3
 
 /* flags used for setting mlme state */
 enum ieee80211_sta_mlme_state {
 	IEEE80211_STA_MLME_DISABLED,
+	IEEE80211_STA_MLME_DIRECT_PROBE,
 	IEEE80211_STA_MLME_AUTHENTICATE,
 	IEEE80211_STA_MLME_ASSOCIATE,
 	IEEE80211_STA_MLME_ASSOCIATED,
@@ -362,6 +364,7 @@ struct ieee80211_if_sta {
 	struct sk_buff_head skb_queue;
 
 	int assoc_scan_tries; /* number of scans done pre-association */
+	int direct_probe_tries; /* retries for direct probes */
 	int auth_tries; /* retries for auth req */
 	int assoc_tries; /* retries for assoc req */
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e821d1ac5b0..84999791a33 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -595,8 +595,10 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta, int deauth)
 {
-	if (deauth)
+	if (deauth) {
+		ifsta->direct_probe_tries = 0;
 		ifsta->auth_tries = 0;
+	}
 	ifsta->assoc_scan_tries = 0;
 	ifsta->assoc_tries = 0;
 	ieee80211_set_associated(sdata, ifsta, 0);
@@ -654,6 +656,36 @@ static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_tx(sdata, skb, encrypt);
 }
 
+static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_if_sta *ifsta)
+{
+	DECLARE_MAC_BUF(mac);
+
+	ifsta->direct_probe_tries++;
+	if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
+		printk(KERN_DEBUG "%s: direct probe to AP %s timed out\n",
+		       sdata->dev->name, print_mac(mac, ifsta->bssid));
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
+		return;
+	}
+
+	printk(KERN_DEBUG "%s: direct probe to AP %s try %d\n",
+			sdata->dev->name, print_mac(mac, ifsta->bssid),
+			ifsta->direct_probe_tries);
+
+	ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+
+	set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request);
+
+	/* Direct probe is sent to broadcast address as some APs
+	 * will not answer to direct packet in unassociated state.
+	 */
+	ieee80211_send_probe_req(sdata, NULL,
+				 ifsta->ssid, ifsta->ssid_len);
+
+	mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
+}
+
 
 static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta)
@@ -1947,7 +1979,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
 	if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE ||
 	    ifsta->state == IEEE80211_STA_MLME_ASSOCIATE ||
 	    ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
-		ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+		ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
@@ -2540,8 +2572,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_mgmt *mgmt,
 				  size_t len,
 				  struct ieee80211_rx_status *rx_status,
-				  struct ieee802_11_elems *elems,
-				  int beacon)
+				  struct ieee802_11_elems *elems)
 {
 	struct ieee80211_local *local = sdata->local;
 	int freq, clen;
@@ -2549,6 +2580,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	struct sta_info *sta;
 	u64 beacon_timestamp, rx_timestamp;
 	struct ieee80211_channel *channel;
+	bool beacon = ieee80211_is_beacon(mgmt->frame_control);
 	DECLARE_MAC_BUF(mac);
 	DECLARE_MAC_BUF(mac2);
 
@@ -2705,15 +2737,14 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	bss->signal = rx_status->signal;
 	bss->noise = rx_status->noise;
 	bss->qual = rx_status->qual;
-	if (!beacon && !bss->probe_resp)
-		bss->probe_resp = true;
-
+	if (!beacon)
+		bss->last_probe_resp = jiffies;
 	/*
 	 * In STA mode, the remaining parameters should not be overridden
 	 * by beacons because they're not necessarily accurate there.
 	 */
 	if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
-	    bss->probe_resp && beacon) {
+	    bss->last_probe_resp && beacon) {
 		ieee80211_rx_bss_put(local, bss);
 		return;
 	}
@@ -2868,6 +2899,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
 {
 	size_t baselen;
 	struct ieee802_11_elems elems;
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 
 	if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
 		return; /* ignore ProbeResp to foreign address */
@@ -2879,7 +2911,15 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
 	ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
 				&elems);
 
-	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, 0);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+
+	/* direct probe may be part of the association flow */
+	if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE,
+							&ifsta->request)) {
+		printk(KERN_DEBUG "%s direct probe responded\n",
+		       sdata->dev->name);
+		ieee80211_authenticate(sdata, ifsta);
+	}
 }
 
 
@@ -2902,7 +2942,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 
 	ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems);
 
-	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, 1);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 		return;
@@ -3329,7 +3369,8 @@ void ieee80211_sta_work(struct work_struct *work)
 		mesh_path_start_discovery(sdata);
 #endif
 
-	if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
+	if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
+	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
 	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
 	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
 		if (ifsta->scan_ssid_len)
@@ -3349,6 +3390,9 @@ void ieee80211_sta_work(struct work_struct *work)
 	switch (ifsta->state) {
 	case IEEE80211_STA_MLME_DISABLED:
 		break;
+	case IEEE80211_STA_MLME_DIRECT_PROBE:
+		ieee80211_direct_probe(sdata, ifsta);
+		break;
 	case IEEE80211_STA_MLME_AUTHENTICATE:
 		ieee80211_authenticate(sdata, ifsta);
 		break;
@@ -3408,6 +3452,7 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
 	ifsta->auth_transaction = -1;
 	ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
 	ifsta->assoc_scan_tries = 0;
+	ifsta->direct_probe_tries = 0;
 	ifsta->auth_tries = 0;
 	ifsta->assoc_tries = 0;
 	netif_carrier_off(sdata->dev);
@@ -3509,8 +3554,18 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 					       selected->ssid_len);
 		ieee80211_sta_set_bssid(sdata, selected->bssid);
 		ieee80211_sta_def_wmm_params(sdata, selected, 0);
+
+		/* Send out direct probe if no probe resp was received or
+		 * the one we have is outdated
+		 */
+		if (!selected->last_probe_resp ||
+		    time_after(jiffies, selected->last_probe_resp
+					+ IEEE80211_SCAN_RESULT_EXPIRE))
+			ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+		else
+			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+
 		ieee80211_rx_bss_put(local, selected);
-		ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 		ieee80211_sta_reset_auth(sdata, ifsta);
 		return 0;
 	} else {
-- 
cgit v1.2.3


From cbe2d128a01315fb4bd55b96cf8b963f5df28ea2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 23 Aug 2008 05:10:12 -0700
Subject: tcp: Add tcp_validate_incoming & put duplicated code there
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Large block of code duplication removed.

Sadly, the return value thing is a bit tricky here but it
seems the most sensible way to return positive from validator
on success rather than negative.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 147 ++++++++++++++++++++++++---------------------------
 1 file changed, 69 insertions(+), 78 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67ccce2a96b..e1b15d4e617 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4691,6 +4691,67 @@ out:
 }
 #endif /* CONFIG_NET_DMA */
 
+/* Does PAWS and seqno based validation of an incoming segment, flags will
+ * play significant role here.
+ */
+static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+			      struct tcphdr *th, int syn_inerr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* RFC1323: H1. Apply PAWS check first. */
+	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+	    tcp_paws_discard(sk, skb)) {
+		if (!th->rst) {
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+			tcp_send_dupack(sk, skb);
+			goto discard;
+		}
+		/* Reset is accepted even if it did not pass PAWS. */
+	}
+
+	/* Step 1: check sequence number */
+	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+		/* RFC793, page 37: "In all states except SYN-SENT, all reset
+		 * (RST) segments are validated by checking their SEQ-fields."
+		 * And page 69: "If an incoming segment is not acceptable,
+		 * an acknowledgment should be sent in reply (unless the RST
+		 * bit is set, if so drop the segment and return)".
+		 */
+		if (!th->rst)
+			tcp_send_dupack(sk, skb);
+		goto discard;
+	}
+
+	/* Step 2: check RST bit */
+	if (th->rst) {
+		tcp_reset(sk);
+		goto discard;
+	}
+
+	/* ts_recent update must be made after we are sure that the packet
+	 * is in window.
+	 */
+	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+	/* step 3: check security and precedence [ignored] */
+
+	/* step 4: Check for a SYN in window. */
+	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		if (syn_inerr)
+			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
+		tcp_reset(sk);
+		return -1;
+	}
+
+	return 1;
+
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+
 /*
  *	TCP receive function for the ESTABLISHED state.
  *
@@ -4718,6 +4779,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			struct tcphdr *th, unsigned len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	int res;
 
 	/*
 	 *	Header prediction.
@@ -4898,52 +4960,13 @@ slow_path:
 	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
 		goto csum_error;
 
-	/*
-	 * RFC1323: H1. Apply PAWS check first.
-	 */
-	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(sk, skb)) {
-		if (!th->rst) {
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-			tcp_send_dupack(sk, skb);
-			goto discard;
-		}
-		/* Resets are accepted even if PAWS failed.
-
-		   ts_recent update must be made after we are sure
-		   that the packet is in window.
-		 */
-	}
-
 	/*
 	 *	Standard slow path.
 	 */
 
-	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
-		/* RFC793, page 37: "In all states except SYN-SENT, all reset
-		 * (RST) segments are validated by checking their SEQ-fields."
-		 * And page 69: "If an incoming segment is not acceptable,
-		 * an acknowledgment should be sent in reply (unless the RST bit
-		 * is set, if so drop the segment and return)".
-		 */
-		if (!th->rst)
-			tcp_send_dupack(sk, skb);
-		goto discard;
-	}
-
-	if (th->rst) {
-		tcp_reset(sk);
-		goto discard;
-	}
-
-	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
-	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
-		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
-		tcp_reset(sk);
-		return 1;
-	}
+	res = tcp_validate_incoming(sk, skb, th, 1);
+	if (res <= 0)
+		return -res;
 
 step5:
 	if (th->ack)
@@ -5225,6 +5248,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int queued = 0;
+	int res;
 
 	tp->rx_opt.saw_tstamp = 0;
 
@@ -5277,42 +5301,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
-	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(sk, skb)) {
-		if (!th->rst) {
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-			tcp_send_dupack(sk, skb);
-			goto discard;
-		}
-		/* Reset is accepted even if it did not pass PAWS. */
-	}
-
-	/* step 1: check sequence number */
-	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
-		if (!th->rst)
-			tcp_send_dupack(sk, skb);
-		goto discard;
-	}
-
-	/* step 2: check RST bit */
-	if (th->rst) {
-		tcp_reset(sk);
-		goto discard;
-	}
-
-	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
-	/* step 3: check security and precedence [ignored] */
-
-	/*	step 4:
-	 *
-	 *	Check for a SYN in window.
-	 */
-	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
-		tcp_reset(sk);
-		return 1;
-	}
+	res = tcp_validate_incoming(sk, skb, th, 0);
+	if (res <= 0)
+		return -res;
 
 	/* step 5: check the ACK field */
 	if (th->ack) {
-- 
cgit v1.2.3


From 2cf46637b501794d7fe9e365f0a3046f5d1f5dfb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 23 Aug 2008 05:11:41 -0700
Subject: tcp: Add tcp_collapse_one to eliminate duplicated code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e1b15d4e617..580f9547ddf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4161,6 +4161,18 @@ add_sack:
 	}
 }
 
+static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+					struct sk_buff_head *list)
+{
+	struct sk_buff *next = skb->next;
+
+	__skb_unlink(skb, list);
+	__kfree_skb(skb);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+	return next;
+}
+
 /* Collapse contiguous sequence of skbs head..tail with
  * sequence numbers start..end.
  * Segments with FIN/SYN are not collapsed (only because this
@@ -4178,11 +4190,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 	for (skb = head; skb != tail;) {
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-			struct sk_buff *next = skb->next;
-			__skb_unlink(skb, list);
-			__kfree_skb(skb);
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
-			skb = next;
+			skb = tcp_collapse_one(sk, skb, list);
 			continue;
 		}
 
@@ -4246,11 +4254,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 				start += size;
 			}
 			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-				struct sk_buff *next = skb->next;
-				__skb_unlink(skb, list);
-				__kfree_skb(skb);
-				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
-				skb = next;
+				skb = tcp_collapse_one(sk, skb, list);
 				if (skb == tail ||
 				    tcp_hdr(skb)->syn ||
 				    tcp_hdr(skb)->fin)
-- 
cgit v1.2.3


From a4356b2920fd4861dd6c75f558749fa5c38a00e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 23 Aug 2008 05:12:29 -0700
Subject: tcp: Add tcp_parse_aligned_timestamp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some duplicated code lying around. Located with my suffix tree
tool.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 580f9547ddf..f79a5160729 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3442,6 +3442,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 	}
 }
 
+static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
+{
+	__be32 *ptr = (__be32 *)(th + 1);
+
+	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+		tp->rx_opt.saw_tstamp = 1;
+		++ptr;
+		tp->rx_opt.rcv_tsval = ntohl(*ptr);
+		++ptr;
+		tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+		return 1;
+	}
+	return 0;
+}
+
 /* Fast parse options. This hopes to only see timestamps.
  * If it is wrong it falls back on tcp_parse_options().
  */
@@ -3453,16 +3469,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
 		return 0;
 	} else if (tp->rx_opt.tstamp_ok &&
 		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
-		__be32 *ptr = (__be32 *)(th + 1);
-		if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
-				  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
-			tp->rx_opt.saw_tstamp = 1;
-			++ptr;
-			tp->rx_opt.rcv_tsval = ntohl(*ptr);
-			++ptr;
-			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+		if (tcp_parse_aligned_timestamp(tp, th))
 			return 1;
-		}
 	}
 	tcp_parse_options(skb, &tp->rx_opt, 1);
 	return 1;
@@ -4822,19 +4830,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
 		/* Check timestamp */
 		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
-			__be32 *ptr = (__be32 *)(th + 1);
-
 			/* No? Slow path! */
-			if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
-					  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+			if (!tcp_parse_aligned_timestamp(tp, th))
 				goto slow_path;
 
-			tp->rx_opt.saw_tstamp = 1;
-			++ptr;
-			tp->rx_opt.rcv_tsval = ntohl(*ptr);
-			++ptr;
-			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
-
 			/* If PAWS failed, check it more carefully in slow path */
 			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
 				goto slow_path;
-- 
cgit v1.2.3


From 409a19669e4cd8d1bab7dff31d3b6aa493ff60f0 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Fri, 22 Aug 2008 14:06:12 +0200
Subject: IPVS: Integrate ESP protocol into ip_vs_proto_ah.c

Rename all ah_* functions to ah_esp_* (and adjust comments). Move ESP
protocol definition into ip_vs_proto_ah.c and remove all usage of
ip_vs_proto_esp.c.

Make the compilation of ip_vs_proto_ah.c dependent on a new config
variable, IP_VS_PROTO_AH_ESP, which is selected either by
IP_VS_PROTO_ESP or IP_VS_PROTO_AH. Only compile the selected protocols'
structures within this file.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/Kconfig          |  6 ++++
 net/ipv4/ipvs/Makefile         |  3 +-
 net/ipv4/ipvs/ip_vs_proto_ah.c | 69 ++++++++++++++++++++++++++++--------------
 3 files changed, 54 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 09d0c3f3566..2e48a7e2722 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -71,14 +71,20 @@ config	IP_VS_PROTO_UDP
 	  This option enables support for load balancing UDP transport
 	  protocol. Say Y if unsure.
 
+config	IP_VS_PROTO_AH_ESP
+	bool
+	depends on UNDEFINED
+
 config	IP_VS_PROTO_ESP
 	bool "ESP load balancing support"
+	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing ESP (Encapsulation
 	  Security Payload) transport protocol. Say Y if unsure.
 
 config	IP_VS_PROTO_AH
 	bool "AH load balancing support"
+	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing AH (Authentication
 	  Header) transport protocol. Say Y if unsure.
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
index 30e85de9fff..cda3e0860d6 100644
--- a/net/ipv4/ipvs/Makefile
+++ b/net/ipv4/ipvs/Makefile
@@ -6,8 +6,7 @@
 ip_vs_proto-objs-y :=
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah.o
 
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 73e0ea87c1f..3f9ebd7639a 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -1,5 +1,5 @@
 /*
- * ip_vs_proto_ah.c:	AH IPSec load balancing support for IPVS
+ * ip_vs_proto_ah_esp.c:	AH/ESP IPSec load balancing support for IPVS
  *
  * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
  *		Wensong Zhang <wensong@linuxvirtualserver.org>
@@ -39,11 +39,11 @@ struct isakmp_hdr {
 
 
 static struct ip_vs_conn *
-ah_conn_in_get(const struct sk_buff *skb,
-	       struct ip_vs_protocol *pp,
-	       const struct iphdr *iph,
-	       unsigned int proto_off,
-	       int inverse)
+ah_esp_conn_in_get(const struct sk_buff *skb,
+		   struct ip_vs_protocol *pp,
+		   const struct iphdr *iph,
+		   unsigned int proto_off,
+		   int inverse)
 {
 	struct ip_vs_conn *cp;
 
@@ -79,8 +79,8 @@ ah_conn_in_get(const struct sk_buff *skb,
 
 
 static struct ip_vs_conn *
-ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct iphdr *iph, unsigned int proto_off, int inverse)
+ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    const struct iphdr *iph, unsigned int proto_off, int inverse)
 {
 	struct ip_vs_conn *cp;
 
@@ -112,12 +112,12 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static int
-ah_conn_schedule(struct sk_buff *skb,
-		 struct ip_vs_protocol *pp,
-		 int *verdict, struct ip_vs_conn **cpp)
+ah_esp_conn_schedule(struct sk_buff *skb,
+		     struct ip_vs_protocol *pp,
+		     int *verdict, struct ip_vs_conn **cpp)
 {
 	/*
-	 * AH is only related traffic. Pass the packet to IP stack.
+	 * AH/ESP is only related traffic. Pass the packet to IP stack.
 	 */
 	*verdict = NF_ACCEPT;
 	return 0;
@@ -125,8 +125,8 @@ ah_conn_schedule(struct sk_buff *skb,
 
 
 static void
-ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		int offset, const char *msg)
+ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		    int offset, const char *msg)
 {
 	char buf[256];
 	struct iphdr _iph, *ih;
@@ -143,28 +143,29 @@ ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 }
 
 
-static void ah_init(struct ip_vs_protocol *pp)
+static void ah_esp_init(struct ip_vs_protocol *pp)
 {
 	/* nothing to do now */
 }
 
 
-static void ah_exit(struct ip_vs_protocol *pp)
+static void ah_esp_exit(struct ip_vs_protocol *pp)
 {
 	/* nothing to do now */
 }
 
 
+#ifdef CONFIG_IP_VS_PROTO_AH
 struct ip_vs_protocol ip_vs_protocol_ah = {
 	.name =			"AH",
 	.protocol =		IPPROTO_AH,
 	.num_states =		1,
 	.dont_defrag =		1,
-	.init =			ah_init,
-	.exit =			ah_exit,
-	.conn_schedule =	ah_conn_schedule,
-	.conn_in_get =		ah_conn_in_get,
-	.conn_out_get =		ah_conn_out_get,
+	.init =			ah_esp_init,
+	.exit =			ah_esp_exit,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
 	.snat_handler =		NULL,
 	.dnat_handler =		NULL,
 	.csum_check =		NULL,
@@ -172,7 +173,31 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_debug_packet,
+	.debug_packet =		ah_esp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 	.set_state_timeout =	NULL,
 };
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+struct ip_vs_protocol ip_vs_protocol_esp = {
+	.name =			"ESP",
+	.protocol =		IPPROTO_ESP,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			ah_esp_init,
+	.exit =			ah_esp_exit,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ah_esp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+};
+#endif
-- 
cgit v1.2.3


From e3c2ced8d21410e8bc897480081e2ffc516c0f70 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Fri, 22 Aug 2008 14:06:13 +0200
Subject: IPVS: Rename ip_vs_proto_ah.c to ip_vs_proto_ah_esp.c

After integrating ESP into ip_vs_proto_ah, rename it (and the references to
it) to ip_vs_proto_ah_esp.c and delete the old ip_vs_proto_esp.c.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/Makefile             |   2 +-
 net/ipv4/ipvs/ip_vs_proto_ah.c     | 203 -------------------------------------
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 203 +++++++++++++++++++++++++++++++++++++
 net/ipv4/ipvs/ip_vs_proto_esp.c    | 176 --------------------------------
 4 files changed, 204 insertions(+), 380 deletions(-)
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto_ah.c
 create mode 100644 net/ipv4/ipvs/ip_vs_proto_ah_esp.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto_esp.c

(limited to 'net')

diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
index cda3e0860d6..73a46fe1fe4 100644
--- a/net/ipv4/ipvs/Makefile
+++ b/net/ipv4/ipvs/Makefile
@@ -6,7 +6,7 @@
 ip_vs_proto-objs-y :=
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
 
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
deleted file mode 100644
index 3f9ebd7639a..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * ip_vs_proto_ah_esp.c:	AH/ESP IPSec load balancing support for IPVS
- *
- * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
- *		Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/* TODO:
-
-struct isakmp_hdr {
-	__u8		icookie[8];
-	__u8		rcookie[8];
-	__u8		np;
-	__u8		version;
-	__u8		xchgtype;
-	__u8		flags;
-	__u32		msgid;
-	__u32		length;
-};
-
-*/
-
-#define PORT_ISAKMP	500
-
-
-static struct ip_vs_conn *
-ah_esp_conn_in_get(const struct sk_buff *skb,
-		   struct ip_vs_protocol *pp,
-		   const struct iphdr *iph,
-		   unsigned int proto_off,
-		   int inverse)
-{
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->saddr,
-				       htons(PORT_ISAKMP),
-				       iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->daddr,
-				       htons(PORT_ISAKMP),
-				       iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
-	if (!cp) {
-		/*
-		 * We are not sure if the packet is from our
-		 * service, so our conn_schedule hook should return NF_ACCEPT
-		 */
-		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
-			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
-			  inverse ? "ICMP+" : "",
-			  pp->name,
-			  NIPQUAD(iph->saddr),
-			  NIPQUAD(iph->daddr));
-	}
-
-	return cp;
-}
-
-
-static struct ip_vs_conn *
-ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		    const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->saddr,
-					htons(PORT_ISAKMP),
-					iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->daddr,
-					htons(PORT_ISAKMP),
-					iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
-	if (!cp) {
-		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
-			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
-			  inverse ? "ICMP+" : "",
-			  pp->name,
-			  NIPQUAD(iph->saddr),
-			  NIPQUAD(iph->daddr));
-	}
-
-	return cp;
-}
-
-
-static int
-ah_esp_conn_schedule(struct sk_buff *skb,
-		     struct ip_vs_protocol *pp,
-		     int *verdict, struct ip_vs_conn **cpp)
-{
-	/*
-	 * AH/ESP is only related traffic. Pass the packet to IP stack.
-	 */
-	*verdict = NF_ACCEPT;
-	return 0;
-}
-
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		    int offset, const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else
-		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
-			pp->name, NIPQUAD(ih->saddr),
-			NIPQUAD(ih->daddr));
-
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-#ifdef CONFIG_IP_VS_PROTO_AH
-struct ip_vs_protocol ip_vs_protocol_ah = {
-	.name =			"AH",
-	.protocol =		IPPROTO_AH,
-	.num_states =		1,
-	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
-	.conn_schedule =	ah_esp_conn_schedule,
-	.conn_in_get =		ah_esp_conn_in_get,
-	.conn_out_get =		ah_esp_conn_out_get,
-	.snat_handler =		NULL,
-	.dnat_handler =		NULL,
-	.csum_check =		NULL,
-	.state_transition =	NULL,
-	.register_app =		NULL,
-	.unregister_app =	NULL,
-	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
-	.timeout_change =	NULL,		/* ISAKMP */
-	.set_state_timeout =	NULL,
-};
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_ESP
-struct ip_vs_protocol ip_vs_protocol_esp = {
-	.name =			"ESP",
-	.protocol =		IPPROTO_ESP,
-	.num_states =		1,
-	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
-	.conn_schedule =	ah_esp_conn_schedule,
-	.conn_in_get =		ah_esp_conn_in_get,
-	.conn_out_get =		ah_esp_conn_out_get,
-	.snat_handler =		NULL,
-	.dnat_handler =		NULL,
-	.csum_check =		NULL,
-	.state_transition =	NULL,
-	.register_app =		NULL,
-	.unregister_app =	NULL,
-	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
-	.timeout_change =	NULL,		/* ISAKMP */
-};
-#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 00000000000..3f9ebd7639a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,203 @@
+/*
+ * ip_vs_proto_ah_esp.c:	AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
+ *		Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+	__u8		icookie[8];
+	__u8		rcookie[8];
+	__u8		np;
+	__u8		version;
+	__u8		xchgtype;
+	__u8		flags;
+	__u32		msgid;
+	__u32		length;
+};
+
+*/
+
+#define PORT_ISAKMP	500
+
+
+static struct ip_vs_conn *
+ah_esp_conn_in_get(const struct sk_buff *skb,
+		   struct ip_vs_protocol *pp,
+		   const struct iphdr *iph,
+		   unsigned int proto_off,
+		   int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(IPPROTO_UDP,
+				       iph->saddr,
+				       htons(PORT_ISAKMP),
+				       iph->daddr,
+				       htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_in_get(IPPROTO_UDP,
+				       iph->daddr,
+				       htons(PORT_ISAKMP),
+				       iph->saddr,
+				       htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		/*
+		 * We are not sure if the packet is from our
+		 * service, so our conn_schedule hook should return NF_ACCEPT
+		 */
+		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+			  inverse ? "ICMP+" : "",
+			  pp->name,
+			  NIPQUAD(iph->saddr),
+			  NIPQUAD(iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+					iph->saddr,
+					htons(PORT_ISAKMP),
+					iph->daddr,
+					htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+					iph->daddr,
+					htons(PORT_ISAKMP),
+					iph->saddr,
+					htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+			  inverse ? "ICMP+" : "",
+			  pp->name,
+			  NIPQUAD(iph->saddr),
+			  NIPQUAD(iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static int
+ah_esp_conn_schedule(struct sk_buff *skb,
+		     struct ip_vs_protocol *pp,
+		     int *verdict, struct ip_vs_conn **cpp)
+{
+	/*
+	 * AH/ESP is only related traffic. Pass the packet to IP stack.
+	 */
+	*verdict = NF_ACCEPT;
+	return 0;
+}
+
+
+static void
+ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		    int offset, const char *msg)
+{
+	char buf[256];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void ah_esp_init(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+static void ah_esp_exit(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+struct ip_vs_protocol ip_vs_protocol_ah = {
+	.name =			"AH",
+	.protocol =		IPPROTO_AH,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			ah_esp_init,
+	.exit =			ah_esp_exit,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ah_esp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+	.set_state_timeout =	NULL,
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+struct ip_vs_protocol ip_vs_protocol_esp = {
+	.name =			"ESP",
+	.protocol =		IPPROTO_ESP,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			ah_esp_init,
+	.exit =			ah_esp_exit,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ah_esp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+};
+#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
deleted file mode 100644
index 21d70c8ffa5..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * ip_vs_proto_esp.c:	ESP IPSec load balancing support for IPVS
- *
- * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
- *		Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/* TODO:
-
-struct isakmp_hdr {
-	__u8		icookie[8];
-	__u8		rcookie[8];
-	__u8		np;
-	__u8		version;
-	__u8		xchgtype;
-	__u8		flags;
-	__u32		msgid;
-	__u32		length;
-};
-
-*/
-
-#define PORT_ISAKMP	500
-
-
-static struct ip_vs_conn *
-esp_conn_in_get(const struct sk_buff *skb,
-		struct ip_vs_protocol *pp,
-		const struct iphdr *iph,
-		unsigned int proto_off,
-		int inverse)
-{
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->saddr,
-				       htons(PORT_ISAKMP),
-				       iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->daddr,
-				       htons(PORT_ISAKMP),
-				       iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
-	if (!cp) {
-		/*
-		 * We are not sure if the packet is from our
-		 * service, so our conn_schedule hook should return NF_ACCEPT
-		 */
-		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
-			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
-			  inverse ? "ICMP+" : "",
-			  pp->name,
-			  NIPQUAD(iph->saddr),
-			  NIPQUAD(iph->daddr));
-	}
-
-	return cp;
-}
-
-
-static struct ip_vs_conn *
-esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->saddr,
-					htons(PORT_ISAKMP),
-					iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->daddr,
-					htons(PORT_ISAKMP),
-					iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
-	if (!cp) {
-		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
-			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
-			  inverse ? "ICMP+" : "",
-			  pp->name,
-			  NIPQUAD(iph->saddr),
-			  NIPQUAD(iph->daddr));
-	}
-
-	return cp;
-}
-
-
-static int
-esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
-		  int *verdict, struct ip_vs_conn **cpp)
-{
-	/*
-	 * ESP is only related traffic. Pass the packet to IP stack.
-	 */
-	*verdict = NF_ACCEPT;
-	return 0;
-}
-
-
-static void
-esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		 int offset, const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else
-		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
-			pp->name, NIPQUAD(ih->saddr),
-			NIPQUAD(ih->daddr));
-
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-
-static void esp_init(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-static void esp_exit(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_esp = {
-	.name =			"ESP",
-	.protocol =		IPPROTO_ESP,
-	.num_states =		1,
-	.dont_defrag =		1,
-	.init =			esp_init,
-	.exit =			esp_exit,
-	.conn_schedule =	esp_conn_schedule,
-	.conn_in_get =		esp_conn_in_get,
-	.conn_out_get =		esp_conn_out_get,
-	.snat_handler =		NULL,
-	.dnat_handler =		NULL,
-	.csum_check =		NULL,
-	.state_transition =	NULL,
-	.register_app =		NULL,
-	.unregister_app =	NULL,
-	.app_conn_bind =	NULL,
-	.debug_packet =		esp_debug_packet,
-	.timeout_change =	NULL,		/* ISAKMP */
-};
-- 
cgit v1.2.3


From 33c449675c0e371edd35b3bd7ce8a14451ff2f0b Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Always generate a Reset in response to option errors

RFC4340 states that if a packet is received with an option error (such as a
Mandatory Option as the last byte of the option list), the endpoint should
repond with a Reset.

In the LISTEN and RESPOND states, the endpoint correctly reponds with Reset,
while in the REQUEST/OPEN states, packets with option errors are just ignored.

The packet sequence is as follows:

Case 1:

  Endpoint A                           Endpoint B
  (CLOSED)                             (CLOSED)

               <----------------       REQUEST

  RESPONSE     ----------------->      (*1)
  (with invalid option)
               <----------------       RESET
                                       (with Reset Code 5, "Option Error")

  (*1) currently just ignored, no Reset is sent

Case 2:

  Endpoint A                           Endpoint B
  (OPEN)                               (OPEN)

  DATA-ACK     ----------------->      (*2)
  (with invalid option)
               <----------------       RESET
                                       (with Reset Code 5, "Option Error")

  (*2) currently just ignored, no Reset is sent

This patch fixes the problem, by generating a Reset instead of silently
ignoring option errors.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 803933ab396..779d0ed9ae9 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -370,7 +370,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 		goto discard;
 
 	if (dccp_parse_options(sk, NULL, skb))
-		goto discard;
+		return 1;
 
 	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 		dccp_event_ack_recv(sk, skb);
@@ -610,7 +610,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * Step 8: Process options and mark acknowledgeable
 		 */
 		if (dccp_parse_options(sk, NULL, skb))
-			goto discard;
+			return 1;
 
 		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 			dccp_event_ack_recv(sk, skb);
-- 
cgit v1.2.3


From 1efa6bbac876318ebf6f3a757f18e7d9ebe02dd0 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Silently ignore options with nonsensical lengths

This updates the option-parsing code with regard to RFC 4340, 5.8:
 "[..] options with nonsensical lengths (length byte less than two or more
  than the remaining space in the options portion of the header) MUST be
  ignored, and any option space following an option with nonsensical length
  MUST likewise be ignored."

Hence in the following cases erratic options will be ignored:
 1. The type byte of a multi-byte option is the last byte of the header
    options (i.e. effective option length of 1).
 2. The value of the length byte is less than the minimum 2. This has been
    changed from previously 3: although no multi-byte option with a length
    less than 3 yet exists (cf. table 3 in 5.8), a length of 2 is valid.
    (The switch-statement in dccp_parse has further per-option length checks.)
 3. The option length exceeds the length of the remaining option space.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index dc7c158a2f4..4284f085604 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -81,11 +81,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		/* Check if this isn't a single byte option */
 		if (opt > DCCPO_MAX_RESERVED) {
 			if (opt_ptr == opt_end)
-				goto out_invalid_option;
+				goto out_nonsensical_length;
 
 			len = *opt_ptr++;
-			if (len < 3)
-				goto out_invalid_option;
+			if (len < 2)
+				goto out_nonsensical_length;
 			/*
 			 * Remove the type and len fields, leaving
 			 * just the value size
@@ -95,7 +95,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			opt_ptr += len;
 
 			if (opt_ptr > opt_end)
-				goto out_invalid_option;
+				goto out_nonsensical_length;
 		}
 
 		/*
@@ -283,6 +283,8 @@ ignore_option:
 	if (mandatory)
 		goto out_invalid_option;
 
+out_nonsensical_length:
+	/* RFC 4340, 5.8: ignore option and all remaining option space */
 	return 0;
 
 out_invalid_option:
-- 
cgit v1.2.3


From 5a056417e696fabab8642ec38783de0b496bde76 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Fill in the Data fields for "Option Error" Resets

This updates the use of the `out_invalid_option' label, which produces a
Reset (code 5, "Option Error"), to fill in the  Data1...Data3 fields as
specified in RFC 4340, 5.6.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index 4284f085604..0809b63cb05 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -291,6 +291,9 @@ out_invalid_option:
 	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
 	DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
+	DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
 	return -1;
 }
 
-- 
cgit v1.2.3


From b569d5a134074d4e15ab8e26cf2dd9f02c29fadc Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Empty the write queue when disconnecting

dccp_disconnect() can be called due to several reasons:

 1. when the connection setup failed (inet_stream_connect());
 2. when shutting down (inet_shutdown(), inet_csk_listen_stop());
 3. when aborting the connection (dccp_close() with 0 linger time).

In case (1) the write queue is empty. This patch empties the write queue,
if in case (2) or (3) it was not yet empty.

This avoids triggering the write-queue BUG_TRAP in sk_stream_kill_queues()
later on.

It also seems natural to do: when breaking an association, to delete all
packets that were originally intended for the soon-disconnected end (compare
with call to tcp_write_queue_purge in tcp_disconnect()).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/proto.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 1ca3b26eed0..ae66473b11f 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -309,7 +309,9 @@ int dccp_disconnect(struct sock *sk, int flags)
 		sk->sk_err = ECONNRESET;
 
 	dccp_clear_xmit_timers(sk);
+
 	__skb_queue_purge(&sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_write_queue);
 	if (sk->sk_send_head != NULL) {
 		__kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
-- 
cgit v1.2.3


From 157439fa4a9b38ac4ce41e2fc379fc5031affec8 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Toggle debug output without module unloading

This sets the sysfs permissions so that root can toggle the `debug'
parameter available for nearly every DCCP module. This is useful
since there are various module inter-dependencies. The debug flag
can now be toggled at runtime using

  echo 1 > /sys/module/dccp/parameters/dccp_debug
  echo 1 > /sys/module/dccp_ccid2/parameters/ccid2_debug
  echo 1 > /sys/module/dccp_ccid3/parameters/ccid3_debug
  echo 1 > /sys/module/dccp_tfrc_lib/parameters/tfrc_debug

The last is not very useful yet, since no code at the moment calls
the tfrc_debug() macro.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c    | 2 +-
 net/dccp/ccids/ccid3.c    | 2 +-
 net/dccp/ccids/lib/tfrc.c | 2 +-
 net/dccp/proto.c          | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 8e958087421..9a430734530 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -783,7 +783,7 @@ static struct ccid_operations ccid2 = {
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-module_param(ccid2_debug, bool, 0444);
+module_param(ccid2_debug, bool, 0644);
 MODULE_PARM_DESC(ccid2_debug, "Enable debug messages");
 #endif
 
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f6756e0c9e6..3b8bd7ca676 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -963,7 +963,7 @@ static struct ccid_operations ccid3 = {
 };
 
 #ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-module_param(ccid3_debug, bool, 0444);
+module_param(ccid3_debug, bool, 0644);
 MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
 #endif
 
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
index 97ecec0a8e7..185916218e0 100644
--- a/net/dccp/ccids/lib/tfrc.c
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -10,7 +10,7 @@
 
 #ifdef CONFIG_IP_DCCP_TFRC_DEBUG
 int tfrc_debug;
-module_param(tfrc_debug, bool, 0444);
+module_param(tfrc_debug, bool, 0644);
 MODULE_PARM_DESC(tfrc_debug, "Enable debug messages");
 #endif
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ae66473b11f..d0bd3481976 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1030,7 +1030,7 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
 
 #ifdef CONFIG_IP_DCCP_DEBUG
 int dccp_debug;
-module_param(dccp_debug, bool, 0444);
+module_param(dccp_debug, bool, 0644);
 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
 
 EXPORT_SYMBOL_GPL(dccp_debug);
-- 
cgit v1.2.3


From eff253c4272cd2aac95ccff46d3d2e1a495f22b1 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp ccid-3: Replace lazy BUG_ON with condition

The BUG_ON(w_tot == 0) only holds if there is no more than 1 loss interval in
the loss history. If there is only a single loss interval, the calc_i_mean()
routine need in fact not be called (RFC 3448, 6.3.1).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/lib/loss_interval.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index bcd6ac415bb..5b3ce0688c5 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -67,7 +67,10 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
 	u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
 	int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
 
-	for (i=0; i <= k; i++) {
+	if (k <= 0)
+		return;
+
+	for (i = 0; i <= k; i++) {
 		i_i = tfrc_lh_get_interval(lh, i);
 
 		if (i < k) {
@@ -78,7 +81,6 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
 			i_tot1 += i_i * tfrc_lh_weights[i-1];
 	}
 
-	BUG_ON(w_tot == 0);
 	lh->i_mean = max(i_tot0, i_tot1) / w_tot;
 }
 
-- 
cgit v1.2.3


From 6eac56040787c3ff604fe7d48bbbb7897cd1387c Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 28 Aug 2008 01:08:02 -0700
Subject: tcp: Skip empty hash buckets faster in /proc/net/tcp

On most systems most of the TCP established/time-wait hash buckets are empty.
When walking the hash table for /proc/net/tcp their read locks would
always be aquired just to find out they're empty. This patch changes the code
to check first if the buckets have any entries before taking the lock, which
is much cheaper than taking a lock. Since the hash tables are large
this makes a measurable difference on processing /proc/net/tcp,
especially on architectures with slow read_lock (e.g. PPC)

On a 2GB Core2 system time cat /proc/net/tcp > /dev/null (with a mostly
empty hash table) goes from 0.046s to 0.005s.

On systems with slower atomics (like P4 or POWER4) or larger hash tables
(more RAM) the difference is much higher.

This can be noticeable because there are some daemons around who regularly
scan /proc/net/tcp.

Original idea for this patch from Marcus Meissner, but redone by me.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 44c1e934824..37ca3843c40 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1946,6 +1946,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 	return rc;
 }
 
+static inline int empty_bucket(struct tcp_iter_state *st)
+{
+	return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+		hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+}
+
 static void *established_get_first(struct seq_file *seq)
 {
 	struct tcp_iter_state* st = seq->private;
@@ -1958,6 +1964,10 @@ static void *established_get_first(struct seq_file *seq)
 		struct inet_timewait_sock *tw;
 		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
+		/* Lockless fast path for the common case of empty buckets */
+		if (empty_bucket(st))
+			continue;
+
 		read_lock_bh(lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family ||
@@ -2008,13 +2018,15 @@ get_tw:
 		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
-		if (++st->bucket < tcp_hashinfo.ehash_size) {
-			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
-		} else {
-			cur = NULL;
-			goto out;
-		}
+		/* Look for next non empty bucket */
+		while (++st->bucket < tcp_hashinfo.ehash_size &&
+				empty_bucket(st))
+			;
+		if (st->bucket >= tcp_hashinfo.ehash_size)
+			return NULL;
+
+		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 	} else
 		sk = sk_next(sk);
 
-- 
cgit v1.2.3


From 6be547a61d6220199826070cda792297c3d15994 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 28 Aug 2008 01:09:54 -0700
Subject: inet_diag: Add empty bucket optimization to inet_diag too

Skip quickly over empty buckets in inet_diag.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c10036e7a46..89cb047ab31 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -782,11 +782,15 @@ skip_listen_ht:
 		struct sock *sk;
 		struct hlist_node *node;
 
+		num = 0;
+
+		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+			continue;
+
 		if (i > s_i)
 			s_num = 0;
 
 		read_lock_bh(lock);
-		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
-- 
cgit v1.2.3


From a627266570605a98c5fda5b8234d9e92015e4d14 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Aug 2008 01:11:25 -0700
Subject: ip: speedup /proc/net/rt_cache handling

When scanning route cache hash table, we can avoid taking locks for
empty buckets.  Both /proc/net/rt_cache and NETLINK RTM_GETROUTE
interface are taken into account.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cca921ea855..71598f64c11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -282,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
 	struct rtable *r = NULL;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+		if (!rt_hash_table[st->bucket].chain)
+			continue;
 		rcu_read_lock_bh();
 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
 		while (r) {
@@ -299,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 					  struct rtable *r)
 {
 	struct rt_cache_iter_state *st = seq->private;
+
 	r = r->u.dst.rt_next;
 	while (!r) {
 		rcu_read_unlock_bh();
-		if (--st->bucket < 0)
-			break;
+		do {
+			if (--st->bucket < 0)
+				return NULL;
+		} while (!rt_hash_table[st->bucket].chain);
 		rcu_read_lock_bh();
 		r = rt_hash_table[st->bucket].chain;
 	}
@@ -2840,7 +2845,9 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 	if (s_h < 0)
 		s_h = 0;
 	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++) {
+	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
+		if (!rt_hash_table[h].chain)
+			continue;
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
@@ -2859,7 +2866,6 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 			dst_release(xchg(&skb->dst, NULL));
 		}
 		rcu_read_unlock_bh();
-		s_idx = 0;
 	}
 
 done:
-- 
cgit v1.2.3


From 7f93ea3e246db512c0c17b79847f57dd3a2891e1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 6 Aug 2008 21:45:26 +0200
Subject: mac80211: fill start-sequence-number for BA session start

Otherwise, drivers are required to keep track of the sequence numbers
themselves, and they really shouldn't be since we already do it for
them. I'll fix the race once we figure out how this code should work
at all, it's currently disabled.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 398ca66bdfc..638b75f36e2 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -598,7 +598,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct sta_info *sta;
 	struct ieee80211_sub_if_data *sdata;
-	u16 start_seq_num = 0;
+	u16 start_seq_num;
 	u8 *state;
 	int ret;
 	DECLARE_MAC_BUF(mac);
@@ -678,6 +678,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 	 * call back right away, it must see that the flow has begun */
 	*state |= HT_ADDBA_REQUESTED_MSK;
 
+	/* This is slightly racy because the queue isn't stopped */
+	start_seq_num = sta->tid_seq[tid];
+
 	if (local->ops->ampdu_action)
 		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
 						ra, tid, &start_seq_num);
-- 
cgit v1.2.3


From 9f1ba9062e032fb7b395cd27fc564754fe4e9867 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni.malinen@atheros.com>
Date: Thu, 7 Aug 2008 20:07:01 +0300
Subject: mac80211/cfg80211: Add BSS configuration options for AP mode

This change adds a new cfg80211 command, NL80211_CMD_SET_BSS, to allow
AP mode BSS parameters to be changed from user space (e.g., hostapd).
The drivers using mac80211 are expected to be modified with separate
changes to use the new BSS info parameter for short slot time in the
bss_info_changed() handler.

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c     | 37 +++++++++++++++++++++++++++++++++++
 net/wireless/nl80211.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 6d2ad2bf3ab..2b19532f4c8 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1010,6 +1010,42 @@ static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
 }
 #endif
 
+static int ieee80211_change_bss(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct bss_parameters *params)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_sub_if_data *sdata;
+	u32 changed = 0;
+
+	if (dev == local->mdev)
+		return -EOPNOTSUPP;
+
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (sdata->vif.type != IEEE80211_IF_TYPE_AP)
+		return -EINVAL;
+
+	if (params->use_cts_prot >= 0) {
+		sdata->bss_conf.use_cts_prot = params->use_cts_prot;
+		changed |= BSS_CHANGED_ERP_CTS_PROT;
+	}
+	if (params->use_short_preamble >= 0) {
+		sdata->bss_conf.use_short_preamble =
+			params->use_short_preamble;
+		changed |= BSS_CHANGED_ERP_PREAMBLE;
+	}
+	if (params->use_short_slot_time >= 0) {
+		sdata->bss_conf.use_short_slot =
+			params->use_short_slot_time;
+		changed |= BSS_CHANGED_ERP_SLOT;
+	}
+
+	ieee80211_bss_info_change_notify(sdata, changed);
+
+	return 0;
+}
+
 struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -1033,4 +1069,5 @@ struct cfg80211_ops mac80211_config_ops = {
 	.get_mpath = ieee80211_get_mpath,
 	.dump_mpath = ieee80211_dump_mpath,
 #endif
+	.change_bss = ieee80211_change_bss,
 };
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 59eb2cf42e5..47542ee01c5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -87,6 +87,10 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
 				.len = IEEE80211_MAX_MESH_ID_LEN },
 	[NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 },
+
+	[NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
+	[NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
+	[NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
 };
 
 /* message building helper */
@@ -1525,6 +1529,48 @@ static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *drv;
+	int err;
+	struct net_device *dev;
+	struct bss_parameters params;
+
+	memset(&params, 0, sizeof(params));
+	/* default to not changing parameters */
+	params.use_cts_prot = -1;
+	params.use_short_preamble = -1;
+	params.use_short_slot_time = -1;
+
+	if (info->attrs[NL80211_ATTR_BSS_CTS_PROT])
+		params.use_cts_prot =
+		    nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]);
+	if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE])
+		params.use_short_preamble =
+		    nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]);
+	if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME])
+		params.use_short_slot_time =
+		    nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]);
+
+	err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
+	if (err)
+		return err;
+
+	if (!drv->ops->change_bss) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rtnl_lock();
+	err = drv->ops->change_bss(&drv->wiphy, dev, &params);
+	rtnl_unlock();
+
+ out:
+	cfg80211_put_dev(drv);
+	dev_put(dev);
+	return err;
+}
+
 static struct genl_ops nl80211_ops[] = {
 	{
 		.cmd = NL80211_CMD_GET_WIPHY,
@@ -1656,6 +1702,12 @@ static struct genl_ops nl80211_ops[] = {
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NL80211_CMD_SET_BSS,
+		.doit = nl80211_set_bss,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 /* multicast groups */
-- 
cgit v1.2.3


From 43ac2ca3840f64f699a239535c590fa7ebaaac27 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni.malinen@atheros.com>
Date: Fri, 15 Aug 2008 22:21:27 +0300
Subject: mac80211: Handle scan result IEs in one block

Clean up and extend scan result processing by storing all the IEs from
Beacon/Probe Response frames in a single block instead of allocating
memory for each specific IE separately. This removes lot of unnecessary
code and automatically supports reporting of new IEs (e.g., IEEE
802.11r) into user space without need to manually extend mac80211
scanning code whenever a new protocol adds IE(s).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  18 ++--
 net/mac80211/mlme.c        | 230 ++++++++++++++++-----------------------------
 2 files changed, 89 insertions(+), 159 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 8361054fb7c..2bb546744b9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -79,16 +79,11 @@ struct ieee80211_sta_bss {
 	enum ieee80211_band band;
 	int freq;
 	int signal, noise, qual;
-	u8 *wpa_ie;
-	size_t wpa_ie_len;
-	u8 *rsn_ie;
-	size_t rsn_ie_len;
-	u8 *wmm_ie;
-	size_t wmm_ie_len;
-	u8 *ht_ie;
-	size_t ht_ie_len;
-	u8 *ht_add_ie;
-	size_t ht_add_ie_len;
+	u8 *ies; /* all information elements from the last Beacon or Probe
+		  * Response frames; note Beacon frame is not allowed to
+		  * override values from Probe Response */
+	size_t ies_len;
+	bool wmm_used;
 #ifdef CONFIG_MAC80211_MESH
 	u8 *mesh_id;
 	size_t mesh_id_len;
@@ -773,6 +768,9 @@ struct ieee80211_ra_tid {
 
 /* Parsed Information Elements */
 struct ieee802_11_elems {
+	u8 *ie_start;
+	size_t total_len;
+
 	/* pointers to IEs */
 	u8 *ssid;
 	u8 *supp_rates;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 84999791a33..e088b440aaf 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -98,6 +98,8 @@ void ieee802_11_parse_elems(u8 *start, size_t len,
 	u8 *pos = start;
 
 	memset(elems, 0, sizeof(*elems));
+	elems->ie_start = start;
+	elems->total_len = len;
 
 	while (left >= 2) {
 		u8 id, elen;
@@ -234,6 +236,27 @@ void ieee802_11_parse_elems(u8 *start, size_t len,
 }
 
 
+static u8 * ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
+{
+	u8 *end, *pos;
+
+	pos = bss->ies;
+	if (pos == NULL)
+		return NULL;
+	end = pos + bss->ies_len;
+
+	while (pos + 1 < end) {
+		if (pos + 2 + pos[1] > end)
+			break;
+		if (pos[0] == ie)
+			return pos;
+		pos += 2 + pos[1];
+	}
+
+	return NULL;
+}
+
+
 static int ecw2cw(int ecw)
 {
 	return (1 << ecw) - 1;
@@ -737,7 +760,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
-	u8 *pos, *ies;
+	u8 *pos, *ies, *ht_add_ie;
 	int i, len, count, rates_len, supp_rates_len;
 	u16 capab;
 	struct ieee80211_sta_bss *bss;
@@ -772,7 +795,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	if (bss) {
 		if (bss->capability & WLAN_CAPABILITY_PRIVACY)
 			capab |= WLAN_CAPABILITY_PRIVACY;
-		if (bss->wmm_ie)
+		if (bss->wmm_used)
 			wmm = 1;
 
 		/* get all rates supported by the device and the AP as
@@ -894,9 +917,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 
 	/* wmm support is a must to HT */
 	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
-	    sband->ht_info.ht_supported && bss->ht_add_ie) {
+	    sband->ht_info.ht_supported &&
+	    (ht_add_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_EXTRA_INFO))) {
 		struct ieee80211_ht_addt_info *ht_add_info =
-			(struct ieee80211_ht_addt_info *)bss->ht_add_ie;
+			(struct ieee80211_ht_addt_info *)ht_add_ie;
 		u16 cap = sband->ht_info.cap;
 		__le16 tmp;
 		u32 flags = local->hw.conf.channel->flags;
@@ -2372,11 +2396,7 @@ ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_i
 
 static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
 {
-	kfree(bss->wpa_ie);
-	kfree(bss->rsn_ie);
-	kfree(bss->wmm_ie);
-	kfree(bss->ht_ie);
-	kfree(bss->ht_add_ie);
+	kfree(bss->ies);
 	kfree(bss_mesh_id(bss));
 	kfree(bss_mesh_cfg(bss));
 	kfree(bss);
@@ -2662,43 +2682,6 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		bss->has_erp_value = 1;
 	}
 
-	if (elems->ht_cap_elem &&
-	     (!bss->ht_ie || bss->ht_ie_len != elems->ht_cap_elem_len ||
-	     memcmp(bss->ht_ie, elems->ht_cap_elem, elems->ht_cap_elem_len))) {
-		kfree(bss->ht_ie);
-		bss->ht_ie = kmalloc(elems->ht_cap_elem_len + 2, GFP_ATOMIC);
-		if (bss->ht_ie) {
-			memcpy(bss->ht_ie, elems->ht_cap_elem - 2,
-				elems->ht_cap_elem_len + 2);
-			bss->ht_ie_len = elems->ht_cap_elem_len + 2;
-		} else
-			bss->ht_ie_len = 0;
-	} else if (!elems->ht_cap_elem && bss->ht_ie) {
-		kfree(bss->ht_ie);
-		bss->ht_ie = NULL;
-		bss->ht_ie_len = 0;
-	}
-
-	if (elems->ht_info_elem &&
-	     (!bss->ht_add_ie ||
-	     bss->ht_add_ie_len != elems->ht_info_elem_len ||
-	     memcmp(bss->ht_add_ie, elems->ht_info_elem,
-			elems->ht_info_elem_len))) {
-		kfree(bss->ht_add_ie);
-		bss->ht_add_ie =
-			kmalloc(elems->ht_info_elem_len + 2, GFP_ATOMIC);
-		if (bss->ht_add_ie) {
-			memcpy(bss->ht_add_ie, elems->ht_info_elem - 2,
-				elems->ht_info_elem_len + 2);
-			bss->ht_add_ie_len = elems->ht_info_elem_len + 2;
-		} else
-			bss->ht_add_ie_len = 0;
-	} else if (!elems->ht_info_elem && bss->ht_add_ie) {
-		kfree(bss->ht_add_ie);
-		bss->ht_add_ie = NULL;
-		bss->ht_add_ie_len = 0;
-	}
-
 	bss->beacon_int = le16_to_cpu(mgmt->u.beacon.beacon_int);
 	bss->capability = le16_to_cpu(mgmt->u.beacon.capab_info);
 
@@ -2749,88 +2732,17 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 
-	if (elems->wpa &&
-	    (!bss->wpa_ie || bss->wpa_ie_len != elems->wpa_len ||
-	     memcmp(bss->wpa_ie, elems->wpa, elems->wpa_len))) {
-		kfree(bss->wpa_ie);
-		bss->wpa_ie = kmalloc(elems->wpa_len + 2, GFP_ATOMIC);
-		if (bss->wpa_ie) {
-			memcpy(bss->wpa_ie, elems->wpa - 2, elems->wpa_len + 2);
-			bss->wpa_ie_len = elems->wpa_len + 2;
-		} else
-			bss->wpa_ie_len = 0;
-	} else if (!elems->wpa && bss->wpa_ie) {
-		kfree(bss->wpa_ie);
-		bss->wpa_ie = NULL;
-		bss->wpa_ie_len = 0;
-	}
-
-	if (elems->rsn &&
-	    (!bss->rsn_ie || bss->rsn_ie_len != elems->rsn_len ||
-	     memcmp(bss->rsn_ie, elems->rsn, elems->rsn_len))) {
-		kfree(bss->rsn_ie);
-		bss->rsn_ie = kmalloc(elems->rsn_len + 2, GFP_ATOMIC);
-		if (bss->rsn_ie) {
-			memcpy(bss->rsn_ie, elems->rsn - 2, elems->rsn_len + 2);
-			bss->rsn_ie_len = elems->rsn_len + 2;
-		} else
-			bss->rsn_ie_len = 0;
-	} else if (!elems->rsn && bss->rsn_ie) {
-		kfree(bss->rsn_ie);
-		bss->rsn_ie = NULL;
-		bss->rsn_ie_len = 0;
+	if (bss->ies == NULL || bss->ies_len < elems->total_len) {
+		kfree(bss->ies);
+		bss->ies = kmalloc(elems->total_len, GFP_ATOMIC);
 	}
+	if (bss->ies) {
+		memcpy(bss->ies, elems->ie_start, elems->total_len);
+		bss->ies_len = elems->total_len;
+	} else
+		bss->ies_len = 0;
 
-	/*
-	 * Cf.
-	 * http://www.wipo.int/pctdb/en/wo.jsp?wo=2007047181&IA=WO2007047181&DISPLAY=DESC
-	 *
-	 * quoting:
-	 *
-	 * In particular, "Wi-Fi CERTIFIED for WMM - Support for Multimedia
-	 * Applications with Quality of Service in Wi-Fi Networks," Wi- Fi
-	 * Alliance (September 1, 2004) is incorporated by reference herein.
-	 * The inclusion of the WMM Parameters in probe responses and
-	 * association responses is mandatory for WMM enabled networks. The
-	 * inclusion of the WMM Parameters in beacons, however, is optional.
-	 */
-
-	if (elems->wmm_param &&
-	    (!bss->wmm_ie || bss->wmm_ie_len != elems->wmm_param_len ||
-	     memcmp(bss->wmm_ie, elems->wmm_param, elems->wmm_param_len))) {
-		kfree(bss->wmm_ie);
-		bss->wmm_ie = kmalloc(elems->wmm_param_len + 2, GFP_ATOMIC);
-		if (bss->wmm_ie) {
-			memcpy(bss->wmm_ie, elems->wmm_param - 2,
-			       elems->wmm_param_len + 2);
-			bss->wmm_ie_len = elems->wmm_param_len + 2;
-		} else
-			bss->wmm_ie_len = 0;
-	} else if (elems->wmm_info &&
-		    (!bss->wmm_ie || bss->wmm_ie_len != elems->wmm_info_len ||
-		     memcmp(bss->wmm_ie, elems->wmm_info,
-						elems->wmm_info_len))) {
-		 /* As for certain AP's Fifth bit is not set in WMM IE in
-		  * beacon frames.So while parsing the beacon frame the
-		  * wmm_info structure is used instead of wmm_param.
-		  * wmm_info structure was never used to set bss->wmm_ie.
-		  * This code fixes this problem by copying the WME
-		  * information from wmm_info to bss->wmm_ie and enabling
-		  * n-band association.
-		  */
-		kfree(bss->wmm_ie);
-		bss->wmm_ie = kmalloc(elems->wmm_info_len + 2, GFP_ATOMIC);
-		if (bss->wmm_ie) {
-			memcpy(bss->wmm_ie, elems->wmm_info - 2,
-			       elems->wmm_info_len + 2);
-			bss->wmm_ie_len = elems->wmm_info_len + 2;
-		} else
-			bss->wmm_ie_len = 0;
-	} else if (!elems->wmm_param && !elems->wmm_info && bss->wmm_ie) {
-		kfree(bss->wmm_ie);
-		bss->wmm_ie = NULL;
-		bss->wmm_ie_len = 0;
-	}
+	bss->wmm_used = elems->wmm_param || elems->wmm_info;
 
 	/* check if we need to merge IBSS */
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && beacon &&
@@ -4146,6 +4058,48 @@ int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t
 	return 0;
 }
 
+
+static void ieee80211_sta_add_scan_ies(struct iw_request_info *info,
+				       struct ieee80211_sta_bss *bss,
+				       char **current_ev, char *end_buf)
+{
+	u8 *pos, *end, *next;
+	struct iw_event iwe;
+
+	if (bss == NULL || bss->ies == NULL)
+		return;
+
+	/*
+	 * If needed, fragment the IEs buffer (at IE boundaries) into short
+	 * enough fragments to fit into IW_GENERIC_IE_MAX octet messages.
+	 */
+	pos = bss->ies;
+	end = pos + bss->ies_len;
+
+	while (end - pos > IW_GENERIC_IE_MAX) {
+		next = pos + 2 + pos[1];
+		while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX)
+			next = next + 2 + next[1];
+
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVGENIE;
+		iwe.u.data.length = next - pos;
+		*current_ev = iwe_stream_add_point(info, *current_ev,
+						   end_buf, &iwe, pos);
+
+		pos = next;
+	}
+
+	if (end > pos) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVGENIE;
+		iwe.u.data.length = end - pos;
+		*current_ev = iwe_stream_add_point(info, *current_ev,
+						   end_buf, &iwe, pos);
+	}
+}
+
+
 static char *
 ieee80211_sta_scan_result(struct ieee80211_local *local,
 			  struct iw_request_info *info,
@@ -4225,29 +4179,7 @@ ieee80211_sta_scan_result(struct ieee80211_local *local,
 	current_ev = iwe_stream_add_point(info, current_ev, end_buf,
 					  &iwe, "");
 
-	if (bss && bss->wpa_ie) {
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVGENIE;
-		iwe.u.data.length = bss->wpa_ie_len;
-		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-						  &iwe, bss->wpa_ie);
-	}
-
-	if (bss && bss->rsn_ie) {
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVGENIE;
-		iwe.u.data.length = bss->rsn_ie_len;
-		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-						  &iwe, bss->rsn_ie);
-	}
-
-	if (bss && bss->ht_ie) {
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVGENIE;
-		iwe.u.data.length = bss->ht_ie_len;
-		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-						  &iwe, bss->ht_ie);
-	}
+	ieee80211_sta_add_scan_ies(info, bss, &current_ev, end_buf);
 
 	if (bss && bss->supp_rates_len > 0) {
 		/* display all supported rates in readable format */
-- 
cgit v1.2.3


From 2f58bbf27fe5321a7a208be9071efc54e8a8a3bd Mon Sep 17 00:00:00 2001
From: Daniel Wagner <wagi@monom.org>
Date: Tue, 19 Aug 2008 15:44:35 +0200
Subject: mac80211: Use only precedence level of DSCP field for frame
 classification

Bit 4-5 of DSCP should not be considered by classify_d1. The
802.11 QoS Priority field is only depending on the precedence level.

Signed-off-by: Daniel Wagner <wagi@monom.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wme.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 4310e2f6566..7229e958879 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -47,8 +47,6 @@ static unsigned int classify_1d(struct sk_buff *skb)
 		return 0;
 	}
 
-	if (dscp & 0x1c)
-		return 0;
 	return dscp >> 5;
 }
 
-- 
cgit v1.2.3


From 36aedc903ea11a4188de0a118d26c9f20afdd272 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Mon, 25 Aug 2008 11:58:58 +0300
Subject: mac80211/cfg80211: HT capabilities for NEW_STA

Allow userspace (e.g., hostapd) to set HT capabilities for associated
STAs. This is based on a patch from Zhu Yi <yi.zhu@intel.com> (only
the NL80211_ATTR_HT_CAPABILITY for NEW_STA part is included here).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c     |  5 +++++
 net/wireless/nl80211.c | 10 ++++++++++
 2 files changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 2b19532f4c8..928813ce08e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -674,6 +674,11 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 		sta->supp_rates[local->oper_channel->band] = rates;
 	}
 
+	if (params->ht_capa) {
+		ieee80211_ht_cap_ie_to_ht_info(params->ht_capa,
+					       &sta->ht_info);
+	}
+
 	if (ieee80211_vif_is_mesh(&sdata->vif) && params->plink_action) {
 		switch (params->plink_action) {
 		case PLINK_ACTION_OPEN:
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 47542ee01c5..4d6c02afd6f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -91,6 +91,9 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
+
+	[NL80211_ATTR_HT_CAPABILITY] = { .type = NLA_BINARY,
+					 .len = NL80211_HT_CAPABILITY_LEN },
 };
 
 /* message building helper */
@@ -1129,6 +1132,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 		params.listen_interval =
 		    nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
 
+	if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
+		params.ht_capa =
+			nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
+
 	if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS],
 				&params.station_flags))
 		return -EINVAL;
@@ -1192,6 +1199,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	params.listen_interval =
 		nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
 	params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
+	if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
+		params.ht_capa =
+			nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
 
 	if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS],
 				&params.station_flags))
-- 
cgit v1.2.3


From 849e0576a76bc421aacd782f97948856f487726c Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Tue, 26 Aug 2008 11:57:57 -0300
Subject: rfkill: use strict_strtoul (v2)

Switch sysfs parsing to something that actually works properly.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 47e0b2d232e..173d039d990 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -402,12 +402,16 @@ static ssize_t rfkill_state_store(struct device *dev,
 				  const char *buf, size_t count)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
-	unsigned int state = simple_strtoul(buf, NULL, 0);
+	unsigned long state;
 	int error;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
+	error = strict_strtoul(buf, 0, &state);
+	if (error)
+		return error;
+
 	/* RFKILL_STATE_HARD_BLOCKED is illegal here... */
 	if (state != RFKILL_STATE_UNBLOCKED &&
 	    state != RFKILL_STATE_SOFT_BLOCKED)
@@ -435,7 +439,8 @@ static ssize_t rfkill_claim_store(struct device *dev,
 				  const char *buf, size_t count)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
-	bool claim = !!simple_strtoul(buf, NULL, 0);
+	unsigned long claim_tmp;
+	bool claim;
 	int error;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -444,6 +449,11 @@ static ssize_t rfkill_claim_store(struct device *dev,
 	if (rfkill->user_claim_unsupported)
 		return -EOPNOTSUPP;
 
+	error = strict_strtoul(buf, 0, &claim_tmp);
+	if (error)
+		return error;
+	claim = !!claim_tmp;
+
 	/*
 	 * Take the global lock to make sure the kernel is not in
 	 * the middle of rfkill_switch_all
-- 
cgit v1.2.3


From 01b510b9c29caf2134c31d2bc8c2c5cc73987eb6 Mon Sep 17 00:00:00 2001
From: Felipe Balbi <felipe.balbi@nokia.com>
Date: Tue, 26 Aug 2008 11:57:58 -0300
Subject: rfkill: add missing line break

Trivial patch adding a missing line break on
rfkill_claim_show().

Signed-off-by: Felipe Balbi <felipe.balbi@nokia.com>
Acked-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.co>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 173d039d990..b630f358d67 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -431,7 +431,7 @@ static ssize_t rfkill_claim_show(struct device *dev,
 {
 	struct rfkill *rfkill = to_rfkill(dev);
 
-	return sprintf(buf, "%d", rfkill->user_claim);
+	return sprintf(buf, "%d\n", rfkill->user_claim);
 }
 
 static ssize_t rfkill_claim_store(struct device *dev,
-- 
cgit v1.2.3


From f745ba03a12a1c4b98a88a96ab39d9b58ac677a2 Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Tue, 26 Aug 2008 11:57:59 -0300
Subject: rfkill: add WARN and BUG_ON paranoia (v2)

BUG_ON() and WARN() the heck out of buggy drivers calling into the rfkill
subsystem.

Also switch from WARN_ON(1) to the new descriptive WARN().

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 50 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index b630f358d67..910699c4e04 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -76,6 +76,7 @@ static BLOCKING_NOTIFIER_HEAD(rfkill_notifier_list);
  */
 int register_rfkill_notifier(struct notifier_block *nb)
 {
+	BUG_ON(!nb);
 	return blocking_notifier_chain_register(&rfkill_notifier_list, nb);
 }
 EXPORT_SYMBOL_GPL(register_rfkill_notifier);
@@ -91,6 +92,7 @@ EXPORT_SYMBOL_GPL(register_rfkill_notifier);
  */
 int unregister_rfkill_notifier(struct notifier_block *nb)
 {
+	BUG_ON(!nb);
 	return blocking_notifier_chain_unregister(&rfkill_notifier_list, nb);
 }
 EXPORT_SYMBOL_GPL(unregister_rfkill_notifier);
@@ -202,6 +204,9 @@ static int rfkill_toggle_radio(struct rfkill *rfkill,
 		 * RFKILL_STATE_HARD_BLOCKED */
 		break;
 	default:
+		WARN(1, KERN_WARNING
+			"rfkill: illegal state %d passed as parameter "
+			"to rfkill_toggle_radio\n", state);
 		return -EINVAL;
 	}
 
@@ -236,7 +241,11 @@ static void __rfkill_switch_all(const enum rfkill_type type,
 {
 	struct rfkill *rfkill;
 
-	if (unlikely(state >= RFKILL_STATE_MAX))
+	if (WARN((state >= RFKILL_STATE_MAX || type >= RFKILL_TYPE_MAX),
+			KERN_WARNING
+			"rfkill: illegal state %d or type %d "
+			"passed as parameter to __rfkill_switch_all\n",
+			state, type))
 		return;
 
 	rfkill_global_states[type].current_state = state;
@@ -334,7 +343,11 @@ int rfkill_force_state(struct rfkill *rfkill, enum rfkill_state state)
 {
 	enum rfkill_state oldstate;
 
-	if (unlikely(state >= RFKILL_STATE_MAX))
+	BUG_ON(!rfkill);
+	if (WARN((state >= RFKILL_STATE_MAX),
+			KERN_WARNING
+			"rfkill: illegal state %d passed as parameter "
+			"to rfkill_force_state\n", state))
 		return -EINVAL;
 
 	mutex_lock(&rfkill->mutex);
@@ -593,10 +606,10 @@ static int rfkill_check_duplicity(const struct rfkill *rfkill)
 	memset(seen, 0, sizeof(seen));
 
 	list_for_each_entry(p, &rfkill_list, node) {
-		if (p == rfkill) {
-			WARN_ON(1);
+		if (WARN((p == rfkill), KERN_WARNING
+				"rfkill: illegal attempt to register "
+				"an already registered rfkill struct\n"))
 			return -EEXIST;
-		}
 		set_bit(p->type, seen);
 	}
 
@@ -664,6 +677,12 @@ struct rfkill * __must_check rfkill_allocate(struct device *parent,
 	struct rfkill *rfkill;
 	struct device *dev;
 
+	if (WARN((type >= RFKILL_TYPE_MAX),
+			KERN_WARNING
+			"rfkill: illegal type %d passed as parameter "
+			"to rfkill_allocate\n", type))
+		return NULL;
+
 	rfkill = kzalloc(sizeof(struct rfkill), GFP_KERNEL);
 	if (!rfkill)
 		return NULL;
@@ -736,11 +755,12 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 	struct device *dev = &rfkill->dev;
 	int error;
 
-	if (!rfkill->toggle_radio)
-		return -EINVAL;
-	if (rfkill->type >= RFKILL_TYPE_MAX)
-		return -EINVAL;
-	if (rfkill->state >= RFKILL_STATE_MAX)
+	if (WARN((!rfkill || !rfkill->toggle_radio ||
+			rfkill->type >= RFKILL_TYPE_MAX ||
+			rfkill->state >= RFKILL_STATE_MAX),
+			KERN_WARNING
+			"rfkill: attempt to register a "
+			"badly initialized rfkill struct\n"))
 		return -EINVAL;
 
 	snprintf(dev->bus_id, sizeof(dev->bus_id),
@@ -775,6 +795,7 @@ EXPORT_SYMBOL(rfkill_register);
  */
 void rfkill_unregister(struct rfkill *rfkill)
 {
+	BUG_ON(!rfkill);
 	device_del(&rfkill->dev);
 	rfkill_remove_switch(rfkill);
 	rfkill_led_trigger_unregister(rfkill);
@@ -811,9 +832,12 @@ int rfkill_set_default(enum rfkill_type type, enum rfkill_state state)
 {
 	int error;
 
-	if (type >= RFKILL_TYPE_MAX ||
-	    (state != RFKILL_STATE_SOFT_BLOCKED &&
-	     state != RFKILL_STATE_UNBLOCKED))
+	if (WARN((type >= RFKILL_TYPE_MAX ||
+			(state != RFKILL_STATE_SOFT_BLOCKED &&
+			 state != RFKILL_STATE_UNBLOCKED)),
+			KERN_WARNING
+			"rfkill: illegal state %d or type %d passed as "
+			"parameter to rfkill_set_default\n", state, type))
 		return -EINVAL;
 
 	mutex_lock(&rfkill_mutex);
-- 
cgit v1.2.3


From 15635744484d4255778fc641261be27179c51f9a Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Tue, 26 Aug 2008 11:58:00 -0300
Subject: rfkill: rename rfkill_mutex to rfkill_global_mutex

rfkill_mutex and rfkill->mutex are too easy to confuse with each other.

Rename rfkill_mutex to rfkill_global_mutex, so that they are easier to tell
apart with just one glance.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Cc: Michael Buesch <mb@bu3sch.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 910699c4e04..d5735799ccd 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -37,7 +37,7 @@ MODULE_DESCRIPTION("RF switch support");
 MODULE_LICENSE("GPL");
 
 static LIST_HEAD(rfkill_list);	/* list of registered rf switches */
-static DEFINE_MUTEX(rfkill_mutex);
+static DEFINE_MUTEX(rfkill_global_mutex);
 
 static unsigned int rfkill_default_state = RFKILL_STATE_UNBLOCKED;
 module_param_named(default_state, rfkill_default_state, uint, 0444);
@@ -234,7 +234,7 @@ static int rfkill_toggle_radio(struct rfkill *rfkill,
  * unless a specific switch is claimed by userspace (in which case,
  * that switch is left alone) or suspended.
  *
- * Caller must have acquired rfkill_mutex.
+ * Caller must have acquired rfkill_global_mutex.
  */
 static void __rfkill_switch_all(const enum rfkill_type type,
 				const enum rfkill_state state)
@@ -263,14 +263,14 @@ static void __rfkill_switch_all(const enum rfkill_type type,
  * @type: type of interfaces to be affected
  * @state: the new state
  *
- * Acquires rfkill_mutex and calls __rfkill_switch_all(@type, @state).
+ * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state).
  * Please refer to __rfkill_switch_all() for details.
  */
 void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state)
 {
-	mutex_lock(&rfkill_mutex);
+	mutex_lock(&rfkill_global_mutex);
 	__rfkill_switch_all(type, state);
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 }
 EXPORT_SYMBOL(rfkill_switch_all);
 
@@ -278,7 +278,7 @@ EXPORT_SYMBOL(rfkill_switch_all);
  * rfkill_epo - emergency power off all transmitters
  *
  * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED,
- * ignoring everything in its path but rfkill_mutex and rfkill->mutex.
+ * ignoring everything in its path but rfkill_global_mutex and rfkill->mutex.
  *
  * The global state before the EPO is saved and can be restored later
  * using rfkill_restore_states().
@@ -288,7 +288,8 @@ void rfkill_epo(void)
 	struct rfkill *rfkill;
 	int i;
 
-	mutex_lock(&rfkill_mutex);
+	mutex_lock(&rfkill_global_mutex);
+
 	list_for_each_entry(rfkill, &rfkill_list, node) {
 		mutex_lock(&rfkill->mutex);
 		rfkill_toggle_radio(rfkill, RFKILL_STATE_SOFT_BLOCKED, 1);
@@ -300,7 +301,7 @@ void rfkill_epo(void)
 		rfkill_global_states[i].current_state =
 				RFKILL_STATE_SOFT_BLOCKED;
 	}
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 }
 EXPORT_SYMBOL_GPL(rfkill_epo);
 
@@ -315,10 +316,11 @@ void rfkill_restore_states(void)
 {
 	int i;
 
-	mutex_lock(&rfkill_mutex);
+	mutex_lock(&rfkill_global_mutex);
+
 	for (i = 0; i < RFKILL_TYPE_MAX; i++)
 		__rfkill_switch_all(i, rfkill_global_states[i].default_state);
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 }
 EXPORT_SYMBOL_GPL(rfkill_restore_states);
 
@@ -471,7 +473,7 @@ static ssize_t rfkill_claim_store(struct device *dev,
 	 * Take the global lock to make sure the kernel is not in
 	 * the middle of rfkill_switch_all
 	 */
-	error = mutex_lock_interruptible(&rfkill_mutex);
+	error = mutex_lock_interruptible(&rfkill_global_mutex);
 	if (error)
 		return error;
 
@@ -486,7 +488,7 @@ static ssize_t rfkill_claim_store(struct device *dev,
 		rfkill->user_claim = claim;
 	}
 
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 
 	return error ? error : count;
 }
@@ -621,7 +623,7 @@ static int rfkill_add_switch(struct rfkill *rfkill)
 {
 	int error;
 
-	mutex_lock(&rfkill_mutex);
+	mutex_lock(&rfkill_global_mutex);
 
 	error = rfkill_check_duplicity(rfkill);
 	if (error < 0)
@@ -642,16 +644,16 @@ static int rfkill_add_switch(struct rfkill *rfkill)
 
 	error = 0;
 unlock_out:
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 
 	return error;
 }
 
 static void rfkill_remove_switch(struct rfkill *rfkill)
 {
-	mutex_lock(&rfkill_mutex);
+	mutex_lock(&rfkill_global_mutex);
 	list_del_init(&rfkill->node);
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 
 	mutex_lock(&rfkill->mutex);
 	rfkill_toggle_radio(rfkill, RFKILL_STATE_SOFT_BLOCKED, 1);
@@ -840,7 +842,7 @@ int rfkill_set_default(enum rfkill_type type, enum rfkill_state state)
 			"parameter to rfkill_set_default\n", state, type))
 		return -EINVAL;
 
-	mutex_lock(&rfkill_mutex);
+	mutex_lock(&rfkill_global_mutex);
 
 	if (!test_and_set_bit(type, rfkill_states_lockdflt)) {
 		rfkill_global_states[type].default_state = state;
@@ -848,7 +850,7 @@ int rfkill_set_default(enum rfkill_type type, enum rfkill_state state)
 	} else
 		error = -EPERM;
 
-	mutex_unlock(&rfkill_mutex);
+	mutex_unlock(&rfkill_global_mutex);
 	return error;
 }
 EXPORT_SYMBOL_GPL(rfkill_set_default);
-- 
cgit v1.2.3


From 2c10b32bf57db7ec6d4cca4c4aa3d86bacb01c8a Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 2 Sep 2008 17:30:27 -0700
Subject: netlink: Remove compat API for nested attributes

Removes all _nested_compat() functions from the API. The prio qdisc
no longer requires them and netem has its own format anyway. Their
existance is only confusing.

Resend: Also remove the wrapper macro.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 18 ++++++++++++++++--
 net/sched/sch_prio.c  |  6 +-----
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3781e55046d..a11959908d9 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -388,6 +388,20 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
 };
 
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		      const struct nla_policy *policy, int len)
+{
+	int nested_len = nla_len(nla) - NLA_ALIGN(len);
+
+	if (nested_len < 0)
+		return -EINVAL;
+	if (nested_len >= nla_attr_size(0))
+		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+				 nested_len, policy);
+	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+	return 0;
+}
+
 /* Parse netlink message to set options */
 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 {
@@ -399,8 +413,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 	if (opt == NULL)
 		return -EINVAL;
 
-	ret = nla_parse_nested_compat(tb, TCA_NETEM_MAX, opt, netem_policy,
-				      qopt, sizeof(*qopt));
+	qopt = nla_data(opt);
+	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 	if (ret < 0)
 		return ret;
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index a6697c686c7..504a78cdb71 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -254,16 +254,12 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
-	struct nlattr *nest;
 	struct tc_prio_qopt opt;
 
 	opt.bands = q->bands;
 	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
 
-	nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);
-	if (nest == NULL)
-		goto nla_put_failure;
-	nla_nest_compat_end(skb, nest);
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
 	return skb->len;
 
-- 
cgit v1.2.3


From ba1a6c7bc0ff33e405f5156dc8f4145437255f1f Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Always generate a Reset in response to option errors

RFC4340 states that if a packet is received with an option error (such as a
Mandatory Option as the last byte of the option list), the endpoint should
repond with a Reset.

In the LISTEN and RESPOND states, the endpoint correctly reponds with Reset,
while in the REQUEST/OPEN states, packets with option errors are just ignored.

The packet sequence is as follows:

Case 1:

  Endpoint A                           Endpoint B
  (CLOSED)                             (CLOSED)

               <----------------       REQUEST

  RESPONSE     ----------------->      (*1)
  (with invalid option)
               <----------------       RESET
                                       (with Reset Code 5, "Option Error")

  (*1) currently just ignored, no Reset is sent

Case 2:

  Endpoint A                           Endpoint B
  (OPEN)                               (OPEN)

  DATA-ACK     ----------------->      (*2)
  (with invalid option)
               <----------------       RESET
                                       (with Reset Code 5, "Option Error")

  (*2) currently just ignored, no Reset is sent

This patch fixes the problem, by generating a Reset instead of silently
ignoring option errors.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 803933ab396..779d0ed9ae9 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -370,7 +370,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 		goto discard;
 
 	if (dccp_parse_options(sk, NULL, skb))
-		goto discard;
+		return 1;
 
 	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 		dccp_event_ack_recv(sk, skb);
@@ -610,7 +610,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * Step 8: Process options and mark acknowledgeable
 		 */
 		if (dccp_parse_options(sk, NULL, skb))
-			goto discard;
+			return 1;
 
 		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 			dccp_event_ack_recv(sk, skb);
-- 
cgit v1.2.3


From faf61c3319ea336ed47acd6ca86faaaa3a8f4937 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Silently ignore options with nonsensical lengths

This updates the option-parsing code with regard to RFC 4340, 5.8:
 "[..] options with nonsensical lengths (length byte less than two or more
  than the remaining space in the options portion of the header) MUST be
  ignored, and any option space following an option with nonsensical length
  MUST likewise be ignored."

Hence in the following cases erratic options will be ignored:
 1. The type byte of a multi-byte option is the last byte of the header
    options (i.e. effective option length of 1).
 2. The value of the length byte is less than the minimum 2. This has been
    changed from previously 3: although no multi-byte option with a length
    less than 3 yet exists (cf. table 3 in 5.8), a length of 2 is valid.
    (The switch-statement in dccp_parse has further per-option length checks.)
 3. The option length exceeds the length of the remaining option space.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index dc7c158a2f4..4284f085604 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -81,11 +81,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		/* Check if this isn't a single byte option */
 		if (opt > DCCPO_MAX_RESERVED) {
 			if (opt_ptr == opt_end)
-				goto out_invalid_option;
+				goto out_nonsensical_length;
 
 			len = *opt_ptr++;
-			if (len < 3)
-				goto out_invalid_option;
+			if (len < 2)
+				goto out_nonsensical_length;
 			/*
 			 * Remove the type and len fields, leaving
 			 * just the value size
@@ -95,7 +95,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			opt_ptr += len;
 
 			if (opt_ptr > opt_end)
-				goto out_invalid_option;
+				goto out_nonsensical_length;
 		}
 
 		/*
@@ -283,6 +283,8 @@ ignore_option:
 	if (mandatory)
 		goto out_invalid_option;
 
+out_nonsensical_length:
+	/* RFC 4340, 5.8: ignore option and all remaining option space */
 	return 0;
 
 out_invalid_option:
-- 
cgit v1.2.3


From eac7726bf5cd24440d84b166e0813668d1bf3224 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Fill in the Data fields for "Option Error" Resets

This updates the use of the `out_invalid_option' label, which produces a
Reset (code 5, "Option Error"), to fill in the  Data1...Data3 fields as
specified in RFC 4340, 5.6.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index 4284f085604..0809b63cb05 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -291,6 +291,9 @@ out_invalid_option:
 	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
 	DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
+	DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
 	return -1;
 }
 
-- 
cgit v1.2.3


From 48816322ad4d9ce195aaddd10f0ce98c944af193 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Empty the write queue when disconnecting

dccp_disconnect() can be called due to several reasons:

 1. when the connection setup failed (inet_stream_connect());
 2. when shutting down (inet_shutdown(), inet_csk_listen_stop());
 3. when aborting the connection (dccp_close() with 0 linger time).

In case (1) the write queue is empty. This patch empties the write queue,
if in case (2) or (3) it was not yet empty.

This avoids triggering the write-queue BUG_TRAP in sk_stream_kill_queues()
later on.

It also seems natural to do: when breaking an association, to delete all
packets that were originally intended for the soon-disconnected end (compare
with call to tcp_write_queue_purge in tcp_disconnect()).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/proto.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 1ca3b26eed0..ae66473b11f 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -309,7 +309,9 @@ int dccp_disconnect(struct sock *sk, int flags)
 		sk->sk_err = ECONNRESET;
 
 	dccp_clear_xmit_timers(sk);
+
 	__skb_queue_purge(&sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_write_queue);
 	if (sk->sk_send_head != NULL) {
 		__kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
-- 
cgit v1.2.3


From 432649916b0435b608fb3e1fcb97347ac294d38d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp: Toggle debug output without module unloading

This sets the sysfs permissions so that root can toggle the `debug'
parameter available for nearly every DCCP module. This is useful
since there are various module inter-dependencies. The debug flag
can now be toggled at runtime using

  echo 1 > /sys/module/dccp/parameters/dccp_debug
  echo 1 > /sys/module/dccp_ccid2/parameters/ccid2_debug
  echo 1 > /sys/module/dccp_ccid3/parameters/ccid3_debug
  echo 1 > /sys/module/dccp_tfrc_lib/parameters/tfrc_debug

The last is not very useful yet, since no code at the moment calls
the tfrc_debug() macro.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c    | 2 +-
 net/dccp/ccids/ccid3.c    | 2 +-
 net/dccp/ccids/lib/tfrc.c | 2 +-
 net/dccp/proto.c          | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 8e958087421..9a430734530 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -783,7 +783,7 @@ static struct ccid_operations ccid2 = {
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-module_param(ccid2_debug, bool, 0444);
+module_param(ccid2_debug, bool, 0644);
 MODULE_PARM_DESC(ccid2_debug, "Enable debug messages");
 #endif
 
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f6756e0c9e6..3b8bd7ca676 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -963,7 +963,7 @@ static struct ccid_operations ccid3 = {
 };
 
 #ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-module_param(ccid3_debug, bool, 0444);
+module_param(ccid3_debug, bool, 0644);
 MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
 #endif
 
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
index 97ecec0a8e7..185916218e0 100644
--- a/net/dccp/ccids/lib/tfrc.c
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -10,7 +10,7 @@
 
 #ifdef CONFIG_IP_DCCP_TFRC_DEBUG
 int tfrc_debug;
-module_param(tfrc_debug, bool, 0444);
+module_param(tfrc_debug, bool, 0644);
 MODULE_PARM_DESC(tfrc_debug, "Enable debug messages");
 #endif
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ae66473b11f..d0bd3481976 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1030,7 +1030,7 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
 
 #ifdef CONFIG_IP_DCCP_DEBUG
 int dccp_debug;
-module_param(dccp_debug, bool, 0444);
+module_param(dccp_debug, bool, 0644);
 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
 
 EXPORT_SYMBOL_GPL(dccp_debug);
-- 
cgit v1.2.3


From 959fd992f05b7468bf30d759ac0c9fd0ef0fa80b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 23 Aug 2008 13:28:27 +0200
Subject: dccp ccid-3: Replace lazy BUG_ON with condition

The BUG_ON(w_tot == 0) only holds if there is no more than 1 loss interval in
the loss history. If there is only a single loss interval, the calc_i_mean()
routine need in fact not be called (RFC 3448, 6.3.1).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/lib/loss_interval.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index bcd6ac415bb..5b3ce0688c5 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -67,7 +67,10 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
 	u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
 	int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
 
-	for (i=0; i <= k; i++) {
+	if (k <= 0)
+		return;
+
+	for (i = 0; i <= k; i++) {
 		i_i = tfrc_lh_get_interval(lh, i);
 
 		if (i < k) {
@@ -78,7 +81,6 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
 			i_tot1 += i_i * tfrc_lh_weights[i-1];
 	}
 
-	BUG_ON(w_tot == 0);
 	lh->i_mean = max(i_tot0, i_tot1) / w_tot;
 }
 
-- 
cgit v1.2.3


From 5c7c9451f1f422b69bf0e161e471dd3976ecd408 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Basic data structure for feature negotiation

This patch prepares for the new and extended feature-negotiation routines.

The following feature-negotiation data structures are provided:
	* a container for the various (SP or NN) values,
	* symbolic state names to track feature states,
	* an entry struct which holds all current information together,
	* elementary functions to fill in and process these structures.

Entry structs are arranged as FIFO for the following reason: RFC 4340 specifies
that if multiple options of the same type are present, they are processed in the
order of their appearance in the packet; which means that this order needs to be
preserved in the local data structure (the later insertion code also respects
this order).

The struct list_head has been chosen for the following reasons: the most
frequent operations are
 * add new entry at tail (when receiving Change or setting socket options);
 * delete entry (when Confirm has been received);
 * deep copy of entire list (cloning from listening socket onto request socket).

The NN value has been set to 64 bit, which is a currently sufficient upper limit
(Sequence Window feature has 48 bit).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 14 ++++++++++++++
 net/dccp/feat.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 933a0ecf8d4..94a81b8e5ef 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -23,6 +23,20 @@
 
 #define DCCP_FEAT_SP_NOAGREE (-123)
 
+/* copy constructor, fval must not already contain allocated memory */
+static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
+{
+	fval->sp.len = len;
+	if (fval->sp.len > 0) {
+		fval->sp.vec = kmemdup(val, len, gfp_any());
+		if (fval->sp.vec == NULL) {
+			fval->sp.len = 0;
+			return -ENOBUFS;
+		}
+	}
+	return 0;
+}
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index e272222c7ac..94203c230c6 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -14,6 +14,66 @@
 #include <linux/types.h>
 #include "dccp.h"
 
+enum dccp_feat_type {
+	FEAT_AT_RX   = 1,	/* located at RX side of half-connection  */
+	FEAT_AT_TX   = 2,	/* located at TX side of half-connection  */
+	FEAT_SP      = 4,	/* server-priority reconciliation (6.3.1) */
+	FEAT_NN	     = 8,	/* non-negotiable reconciliation (6.3.2)  */
+	FEAT_UNKNOWN = 0xFF	/* not understood or invalid feature	  */
+};
+
+enum dccp_feat_state {
+	FEAT_DEFAULT = 0,	/* using default values from 6.4 */
+	FEAT_INITIALISING,	/* feature is being initialised  */
+	FEAT_CHANGING,		/* Change sent but not confirmed yet */
+	FEAT_UNSTABLE,		/* local modification in state CHANGING */
+	FEAT_STABLE		/* both ends (think they) agree */
+};
+
+/**
+ * dccp_feat_val  -  Container for SP or NN feature values
+ * @nn:     single NN value
+ * @sp.vec: single SP value plus optional preference list
+ * @sp.len: length of @sp.vec in bytes
+ */
+typedef union {
+	u64 nn;
+	struct {
+		u8	*vec;
+		u8	len;
+	}   sp;
+} dccp_feat_val;
+
+/**
+ * struct feat_entry  -  Data structure to perform feature negotiation
+ * @feat_num: one of %dccp_feature_numbers
+ * @val: feature's current value (SP features may have preference list)
+ * @state: feature's current state
+ * @needs_mandatory: whether Mandatory options should be sent
+ * @needs_confirm: whether to send a Confirm instead of a Change
+ * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
+ * @is_local: feature location (1) or feature-remote (0)
+ * @node: list pointers, entries arranged in FIFO order
+ */
+struct dccp_feat_entry {
+	u8                      feat_num;
+	dccp_feat_val           val;
+	enum dccp_feat_state    state:8;
+	bool			needs_mandatory:1,
+				needs_confirm:1,
+				empty_confirm:1,
+				is_local:1;
+
+	struct list_head	node;
+};
+
+static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
+{
+	if (entry->needs_confirm)
+		return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
+	return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
+}
+
 #ifdef CONFIG_IP_DCCP_DEBUG
 extern const char *dccp_feat_typename(const u8 type);
 extern const char *dccp_feat_name(const u8 feat);
-- 
cgit v1.2.3


From b4eec206370b7154dc354dc30f0a3f02ea8468b2 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Implement lookup table for feature-negotiation information

A lookup table for feature-negotiation information, extracted from RFC 4340/42,
is provided by this patch. All currently known features can be found in this
table, along with their feature location, their default value, and type.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 94a81b8e5ef..d7468f70544 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -23,6 +23,80 @@
 
 #define DCCP_FEAT_SP_NOAGREE (-123)
 
+static const struct {
+	u8			feat_num;		/* DCCPF_xxx */
+	enum dccp_feat_type	rxtx;			/* RX or TX  */
+	enum dccp_feat_type	reconciliation;		/* SP or NN  */
+	u8			default_value;		/* as in 6.4 */
+/*
+ *    Lookup table for location and type of features (from RFC 4340/4342)
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ *  | Feature                  | Location | Reconc. | Initial |  Section  |
+ *  |                          | RX | TX  | SP | NN |  Value  | Reference |
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ *  | DCCPF_CCID               |    |  X  | X  |    |   2     | 10        |
+ *  | DCCPF_SHORT_SEQNOS       |    |  X  | X  |    |   0     |  7.6.1    |
+ *  | DCCPF_SEQUENCE_WINDOW    |    |  X  |    | X  | 100     |  7.5.2    |
+ *  | DCCPF_ECN_INCAPABLE      | X  |     | X  |    |   0     | 12.1      |
+ *  | DCCPF_ACK_RATIO          |    |  X  |    | X  |   2     | 11.3      |
+ *  | DCCPF_SEND_ACK_VECTOR    | X  |     | X  |    |   0     | 11.5      |
+ *  | DCCPF_SEND_NDP_COUNT     |    |  X  | X  |    |   0     |  7.7.2    |
+ *  | DCCPF_MIN_CSUM_COVER     | X  |     | X  |    |   0     |  9.2.1    |
+ *  | DCCPF_DATA_CHECKSUM      | X  |     | X  |    |   0     |  9.3.1    |
+ *  | DCCPF_SEND_LEV_RATE      | X  |     | X  |    |   0     | 4342/8.4  |
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ */
+} dccp_feat_table[] = {
+	{ DCCPF_CCID,		 FEAT_AT_TX, FEAT_SP, 2 },
+	{ DCCPF_SHORT_SEQNOS,	 FEAT_AT_TX, FEAT_SP, 0 },
+	{ DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100 },
+	{ DCCPF_ECN_INCAPABLE,	 FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_ACK_RATIO,	 FEAT_AT_TX, FEAT_NN, 2 },
+	{ DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_SEND_NDP_COUNT,  FEAT_AT_TX, FEAT_SP, 0 },
+	{ DCCPF_MIN_CSUM_COVER,  FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_DATA_CHECKSUM,	 FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_SEND_LEV_RATE,	 FEAT_AT_RX, FEAT_SP, 0 },
+};
+#define DCCP_FEAT_SUPPORTED_MAX		ARRAY_SIZE(dccp_feat_table)
+
+/**
+ * dccp_feat_index  -  Hash function to map feature number into array position
+ * Returns consecutive array index or -1 if the feature is not understood.
+ */
+static int dccp_feat_index(u8 feat_num)
+{
+	/* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
+	if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
+		return feat_num - 1;
+
+	/*
+	 * Other features: add cases for new feature types here after adding
+	 * them to the above table.
+	 */
+	switch (feat_num) {
+	case DCCPF_SEND_LEV_RATE:
+			return DCCP_FEAT_SUPPORTED_MAX - 1;
+	}
+	return -1;
+}
+
+static u8 dccp_feat_type(u8 feat_num)
+{
+	int idx = dccp_feat_index(feat_num);
+
+	if (idx < 0)
+		return FEAT_UNKNOWN;
+	return dccp_feat_table[idx].reconciliation;
+}
+
+static int dccp_feat_default_value(u8 feat_num)
+{
+	int idx = dccp_feat_index(feat_num);
+
+	return idx < 0 ? : dccp_feat_table[idx].default_value;
+}
+
 /* copy constructor, fval must not already contain allocated memory */
 static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
 {
@@ -37,6 +111,45 @@ static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
 	return 0;
 }
 
+static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
+{
+	if (unlikely(val == NULL))
+		return;
+	if (dccp_feat_type(feat_num) == FEAT_SP)
+		kfree(val->sp.vec);
+	memset(val, 0, sizeof(*val));
+}
+
+static struct dccp_feat_entry *
+	      dccp_feat_clone_entry(struct dccp_feat_entry const *original)
+{
+	struct dccp_feat_entry *new;
+	u8 type = dccp_feat_type(original->feat_num);
+
+	if (type == FEAT_UNKNOWN)
+		return NULL;
+
+	new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
+	if (new == NULL)
+		return NULL;
+
+	if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
+						      original->val.sp.vec,
+						      original->val.sp.len)) {
+		kfree(new);
+		return NULL;
+	}
+	return new;
+}
+
+static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
+{
+	if (entry != NULL) {
+		dccp_feat_val_destructor(entry->feat_num, &entry->val);
+		kfree(entry);
+	}
+}
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
@@ -653,6 +766,8 @@ const char *dccp_feat_name(const u8 feat)
 	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
 		return feature_names[DCCPF_RESERVED];
 
+	if (feat ==  DCCPF_SEND_LEV_RATE)
+		return "Send Loss Event Rate";
 	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
 		return "CCID-specific";
 
-- 
cgit v1.2.3


From 3001fc0569651f2d0c3b45adc991351471b0c382 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: List management for new feature negotiation

This adds list fields and list management functions for the new feature
negotiation implementation. The new code is kept in parallel to the old
code, until removed at the end of the patch set.

Thanks to Arnaldo for suggestions to improve the code.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index d7468f70544..2ec2cd11769 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -150,6 +150,135 @@ static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
 	}
 }
 
+/*
+ * List management functions
+ *
+ * Feature negotiation lists rely on and maintain the following invariants:
+ * - each feat_num in the list is known, i.e. we know its type and default value
+ * - each feat_num/is_local combination is unique (old entries are overwritten)
+ * - SP values are always freshly allocated
+ * - list is sorted in increasing order of feature number (faster lookup)
+ */
+static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
+						     u8 feat_num, bool is_local)
+{
+	struct dccp_feat_entry *entry;
+
+	list_for_each_entry(entry, fn_list, node)
+		if (entry->feat_num == feat_num && entry->is_local == is_local)
+			return entry;
+		else if (entry->feat_num > feat_num)
+			break;
+	return NULL;
+}
+
+/**
+ * dccp_feat_entry_new  -  Central list update routine (called by all others)
+ * @head:  list to add to
+ * @feat:  feature number
+ * @local: whether the local (1) or remote feature with number @feat is meant
+ * This is the only constructor and serves to ensure the above invariants.
+ */
+static struct dccp_feat_entry *
+	      dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
+{
+	struct dccp_feat_entry *entry;
+
+	list_for_each_entry(entry, head, node)
+		if (entry->feat_num == feat && entry->is_local == local) {
+			dccp_feat_val_destructor(entry->feat_num, &entry->val);
+			return entry;
+		} else if (entry->feat_num > feat) {
+			head = &entry->node;
+			break;
+		}
+
+	entry = kmalloc(sizeof(*entry), gfp_any());
+	if (entry != NULL) {
+		entry->feat_num = feat;
+		entry->is_local = local;
+		list_add_tail(&entry->node, head);
+	}
+	return entry;
+}
+
+/**
+ * dccp_feat_push_change  -  Add/overwrite a Change option in the list
+ * @fn_list: feature-negotiation list to update
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @needs_mandatory: whether to use Mandatory feature negotiation options
+ * @fval: pointer to NN/SP value to be inserted (will be copied)
+ */
+static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
+				 u8 mandatory, dccp_feat_val *fval)
+{
+	struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+	if (new == NULL)
+		return -ENOMEM;
+
+	new->feat_num	     = feat;
+	new->is_local	     = local;
+	new->state	     = FEAT_INITIALISING;
+	new->needs_confirm   = 0;
+	new->empty_confirm   = 0;
+	new->val	     = *fval;
+	new->needs_mandatory = mandatory;
+
+	return 0;
+}
+
+/**
+ * dccp_feat_push_confirm  -  Add a Confirm entry to the FN list
+ * @fn_list: feature-negotiation list to add to
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is being confirmed
+ * @fval: pointer to NN/SP value to be inserted or NULL
+ * Returns 0 on success, a Reset code for further processing otherwise.
+ */
+static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
+				  dccp_feat_val *fval)
+{
+	struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+	if (new == NULL)
+		return DCCP_RESET_CODE_TOO_BUSY;
+
+	new->feat_num	     = feat;
+	new->is_local	     = local;
+	new->state	     = FEAT_STABLE;	/* transition in 6.6.2 */
+	new->needs_confirm   = 1;
+	new->empty_confirm   = (fval == NULL);
+	new->val.nn	     = 0;		/* zeroes the whole structure */
+	if (!new->empty_confirm)
+		new->val     = *fval;
+	new->needs_mandatory = 0;
+
+	return 0;
+}
+
+static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
+{
+	return dccp_feat_push_confirm(fn_list, feat, local, NULL);
+}
+
+static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
+{
+	list_del(&entry->node);
+	dccp_feat_entry_destructor(entry);
+}
+
+void dccp_feat_list_purge(struct list_head *fn_list)
+{
+	struct dccp_feat_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, fn_list, node)
+		dccp_feat_entry_destructor(entry);
+	INIT_LIST_HEAD(fn_list);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
-- 
cgit v1.2.3


From 828755cee087e4a34f45d6c9db661ccd0631cc6d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Per-socket initialisation of feature negotiation

This provides feature-negotiation initialisation for both DCCP sockets and
DCCP request_sockets, to support feature negotiation during connection setup.

It also resolves a FIXME regarding the congestion control initialisation.

Thanks to Wei Yongjun for help with the IPv6 side of this patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h      |  3 ++-
 net/dccp/feat.c      | 19 +++++++++++++++++++
 net/dccp/feat.h      |  1 +
 net/dccp/input.c     |  2 --
 net/dccp/ipv4.c      |  3 ++-
 net/dccp/ipv6.c      |  3 ++-
 net/dccp/minisocks.c |  7 ++++++-
 net/dccp/proto.c     |  1 +
 8 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index b4bc6e095a0..ab096c06bba 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -252,7 +252,8 @@ extern const char *dccp_state_name(const int state);
 extern void dccp_set_state(struct sock *sk, const int state);
 extern void dccp_done(struct sock *sk);
 
-extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
+extern int  dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+			    struct sk_buff const *skb);
 
 extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 2ec2cd11769..faade82856f 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -279,6 +279,25 @@ void dccp_feat_list_purge(struct list_head *fn_list)
 }
 EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
 
+/* generate @to as full clone of @from - @to must not contain any nodes */
+int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
+{
+	struct dccp_feat_entry *entry, *new;
+
+	INIT_LIST_HEAD(to);
+	list_for_each_entry(entry, from, node) {
+		new = dccp_feat_clone_entry(entry);
+		if (new == NULL)
+			goto cloning_failed;
+		list_add_tail(&new->node, to);
+	}
+	return 0;
+
+cloning_failed:
+	dccp_feat_list_purge(to);
+	return -ENOMEM;
+}
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 94203c230c6..7e953fd0a79 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -95,6 +95,7 @@ extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 				   u8 *val, u8 len);
 extern void dccp_feat_clean(struct dccp_minisock *dmsk);
 extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
+extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
 extern int  dccp_feat_init(struct dccp_minisock *dmsk);
 
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 779d0ed9ae9..3070015edc7 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -590,8 +590,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
 								    skb) < 0)
 				return 1;
-
-			/* FIXME: do congestion control initialization */
 			goto discard;
 		}
 		if (dh->dccph_type == DCCP_PKT_RESET)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 882c5c4de69..0ce84ea8911 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -595,7 +595,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	dccp_reqsk_init(req, skb);
+	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+		goto drop_and_free;
 
 	dreq = dccp_rsk(req);
 	if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 5e1ee0da2c4..33e8a1ea304 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -424,7 +424,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	dccp_reqsk_init(req, skb);
+	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+		goto drop_and_free;
 
 	dreq = dccp_rsk(req);
 	if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index b2804e2d1b8..e487133ae07 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -125,6 +125,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
 		newicsk->icsk_rto	    = DCCP_TIMEOUT_INIT;
 
+		INIT_LIST_HEAD(&newdp->dccps_featneg);
 		if (dccp_feat_clone(sk, newsk))
 			goto out_free;
 
@@ -304,7 +305,8 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 
 EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
 
-void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
+int dccp_reqsk_init(struct request_sock *req,
+		    struct dccp_sock const *dp, struct sk_buff const *skb)
 {
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
@@ -312,6 +314,9 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
 	inet_rsk(req)->acked	  = 0;
 	req->rcv_wnd		  = sysctl_dccp_feat_sequence_window;
 	dreq->dreq_timestamp_echo = 0;
+
+	/* inherit feature negotiation options from listening socket */
+	return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
 }
 
 EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d0bd3481976..1cdf4ae9960 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -193,6 +193,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 
 	dccp_init_xmit_timers(sk);
 
+	INIT_LIST_HEAD(&dp->dccps_featneg);
 	/*
 	 * FIXME: We're hardcoding the CCID, and doing this at this point makes
 	 * the listening (master) sock get CCID control blocks, which is not
-- 
cgit v1.2.3


From 702083839b607f390dbed5d2304eb8fc5f4c85ac Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Cleanup routines for feature negotiation

This inserts the required de-allocation routines for memory allocated by
feature negotiation in the socket destructors, replacing dccp_feat_clean()
in one instance.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h  | 2 ++
 net/dccp/ipv4.c  | 1 +
 net/dccp/ipv6.c  | 1 +
 net/dccp/proto.c | 2 +-
 4 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index ab096c06bba..dee4a90886d 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -442,6 +442,8 @@ static inline int dccp_ack_pending(const struct sock *sk)
 	       inet_csk_ack_scheduled(sk);
 }
 
+extern void dccp_feat_list_purge(struct list_head *fn_list);
+
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
 extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
 extern int dccp_insert_option_elapsed_time(struct sock *sk,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 0ce84ea8911..b623f6b2548 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -545,6 +545,7 @@ out:
 
 static void dccp_v4_reqsk_destructor(struct request_sock *req)
 {
+	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
 	kfree(inet_rsk(req)->opt);
 }
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 33e8a1ea304..ad6212e0043 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -302,6 +302,7 @@ done:
 
 static void dccp_v6_reqsk_destructor(struct request_sock *req)
 {
+	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
 	if (inet6_rsk(req)->pktopts != NULL)
 		kfree_skb(inet6_rsk(req)->pktopts);
 }
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 1cdf4ae9960..dafcefd8659 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -268,7 +268,7 @@ void dccp_destroy_sock(struct sock *sk)
 	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
 
 	/* clean up feature negotiation state */
-	dccp_feat_clean(dmsk);
+	dccp_feat_list_purge(&dp->dccps_featneg);
 }
 
 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
-- 
cgit v1.2.3


From 5591d286281fdfb57914f5fad3ca001d44ce8fc6 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Limit feature negotiation to connection setup phase

This patch starts the new implementation of feature negotiation:
 1. Although it is theoretically possible to perform feature negotiation at any
    time (and RFC 4340 supports this), in practice this is prohibitively complex,
    as it requires to put traffic on hold for each new negotiation.
 2. As a byproduct of restricting feature negotiation to connection setup, the
    feature-negotiation retransmit timer is no longer required. This part is now
    mapped onto the protocol-level retransmission.
    Details indicating why timers are no longer needed can be found on
    http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/feature_negotiation/\
	                                      implementation_notes.html

This patch disables anytime negotiation, subsequent patches work out full
feature negotiation support for connection setup.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/feat.c    | 19 ++++++++-----------
 net/dccp/options.c | 18 ------------------
 net/dccp/timer.c   | 12 ------------
 3 files changed, 8 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index faade82856f..77ce2f6b031 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -6,6 +6,8 @@
  *
  *  ASSUMPTIONS
  *  -----------
+ *  o Feature negotiation is coordinated with connection setup (as in TCP), wild
+ *    changes of parameters of an established connection are not supported.
  *  o All currently known SP features have 1-byte quantities. If in the future
  *    extensions of RFCs 4340..42 define features with item lengths larger than
  *    one byte, a feature-specific extension of the code will be required.
@@ -652,6 +654,9 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 {
 	int rc;
 
+	/* Ignore Change requests other than during connection setup */
+	if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING)
+		return 0;
 	dccp_feat_debug(type, feature, *val);
 
 	/* figure out if it's SP or NN feature */
@@ -701,6 +706,9 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 	int found = 0;
 	int all_confirmed = 1;
 
+	/* Ignore Confirm options other than during connection setup */
+	if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING)
+		return 0;
 	dccp_feat_debug(type, feature, *val);
 
 	/* locate our change request */
@@ -735,17 +743,6 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 			all_confirmed = 0;
 	}
 
-	/* fix re-transmit timer */
-	/* XXX gotta make sure that no option negotiation occurs during
-	 * connection shutdown.  Consider that the CLOSEREQ is sent and timer is
-	 * on.  if all options are confirmed it might kill timer which should
-	 * remain alive until close is received.
-	 */
-	if (all_confirmed) {
-		dccp_pr_debug("clear feat negotiation timer %p\n", sk);
-		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
-	}
-
 	if (!found)
 		dccp_pr_debug("%s(%d, ...) never requested\n",
 			      dccp_feat_typename(type), feature);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 0809b63cb05..67a171a1268 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -489,7 +489,6 @@ static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
 
 static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct dccp_opt_pend *opt, *next;
 	int change = 0;
@@ -530,23 +529,6 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	/* Retransmit timer.
-	 * If this is the master listening sock, we don't set a timer on it.  It
-	 * should be fine because if the dude doesn't receive our RESPONSE
-	 * [which will contain the CHANGE] he will send another REQUEST which
-	 * will "retrnasmit" the change.
-	 */
-	if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
-		dccp_pr_debug("reset feat negotiation timer %p\n", sk);
-
-		/* XXX don't reset the timer on re-transmissions.  I.e. reset it
-		 * only when sending new stuff i guess.  Currently the timer
-		 * never backs off because on re-transmission it just resets it!
-		 */
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
-	}
-
 	return 0;
 }
 
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 54b3c7e9e01..162d1e683c3 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -87,17 +87,6 @@ static void dccp_retransmit_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	/* retransmit timer is used for feature negotiation throughout
-	 * connection.  In this case, no packet is re-transmitted, but rather an
-	 * ack is generated and pending changes are placed into its options.
-	 */
-	if (sk->sk_send_head == NULL) {
-		dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
-		if (sk->sk_state == DCCP_OPEN)
-			dccp_send_ack(sk);
-		goto backoff;
-	}
-
 	/*
 	 * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
 	 * sent, no need to retransmit, this sock is dead.
@@ -126,7 +115,6 @@ static void dccp_retransmit_timer(struct sock *sk)
 		return;
 	}
 
-backoff:
 	icsk->icsk_backoff++;
 
 	icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
-- 
cgit v1.2.3


From 86349c8d9c6892b57aff4549256ab1aa65aed0f0 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Registration routines for changing feature values

Two registration routines, for SP and NN features, are provided by this patch,
replacing a previous routine which was used for both feature types.

These are internal-only routines and therefore start with `__feat_register'.

It further exports the known limits of Sequence Window and Ack Ratio as symbolic
constants.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ccids/ccid2.c |   6 +--
 net/dccp/feat.c        | 123 ++++++++++++++++++++++++++++++++++++++++---------
 net/dccp/feat.h        |  25 +++++++++-
 net/dccp/proto.c       |   2 +-
 4 files changed, 128 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9a430734530..c9ea19a4d85 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,7 +25,7 @@
 /*
  * This implementation should follow RFC 4341
  */
-
+#include "../feat.h"
 #include "../ccid.h"
 #include "../dccp.h"
 #include "ccid2.h"
@@ -147,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
 		val = max_ratio;
 	}
-	if (val > 0xFFFF)		/* RFC 4340, 11.3 */
-		val = 0xFFFF;
+	if (val > DCCPF_ACK_RATIO_MAX)
+		val = DCCPF_ACK_RATIO_MAX;
 
 	if (val == dp->dccps_l_ack_ratio)
 		return;
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 77ce2f6b031..b859722fba7 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -300,6 +300,95 @@ cloning_failed:
 	return -ENOMEM;
 }
 
+static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
+{
+	switch (feat_num) {
+	case DCCPF_ACK_RATIO:
+		return val <= DCCPF_ACK_RATIO_MAX;
+	case DCCPF_SEQUENCE_WINDOW:
+		return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
+	}
+	return 0;	/* feature unknown - so we can't tell */
+}
+
+/* check that SP values are within the ranges defined in RFC 4340 */
+static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
+{
+	switch (feat_num) {
+	case DCCPF_CCID:
+		return val == DCCPC_CCID2 || val == DCCPC_CCID3;
+	/* Type-check Boolean feature values: */
+	case DCCPF_SHORT_SEQNOS:
+	case DCCPF_ECN_INCAPABLE:
+	case DCCPF_SEND_ACK_VECTOR:
+	case DCCPF_SEND_NDP_COUNT:
+	case DCCPF_DATA_CHECKSUM:
+	case DCCPF_SEND_LEV_RATE:
+		return val < 2;
+	case DCCPF_MIN_CSUM_COVER:
+		return val < 16;
+	}
+	return 0;			/* feature unknown */
+}
+
+static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
+{
+	if (sp_list == NULL || sp_len < 1)
+		return 0;
+	while (sp_len--)
+		if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
+			return 0;
+	return 1;
+}
+
+/**
+ * __feat_register_nn  -  Register new NN value on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an NN feature from %dccp_feature_numbers
+ * @mandatory: use Mandatory option if 1
+ * @nn_val: value to register (restricted to 4 bytes)
+ * Note that NN features are local by definition (RFC 4340, 6.3.2).
+ */
+static int __feat_register_nn(struct list_head *fn, u8 feat,
+			      u8 mandatory, u64 nn_val)
+{
+	dccp_feat_val fval = { .nn = nn_val };
+
+	if (dccp_feat_type(feat) != FEAT_NN ||
+	    !dccp_feat_is_valid_nn_val(feat, nn_val))
+		return -EINVAL;
+
+	/* Don't bother with default values, they will be activated anyway. */
+	if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
+		return 0;
+
+	return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
+}
+
+/**
+ * __feat_register_sp  -  Register new SP value/list on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an SP feature from %dccp_feature_numbers
+ * @is_local: whether the local (1) or the remote (0) @feat is meant
+ * @mandatory: use Mandatory option if 1
+ * @sp_val: SP value followed by optional preference list
+ * @sp_len: length of @sp_val in bytes
+ */
+static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
+			      u8 mandatory, u8 const *sp_val, u8 sp_len)
+{
+	dccp_feat_val fval;
+
+	if (dccp_feat_type(feat) != FEAT_SP ||
+	    !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
+		return -EINVAL;
+
+	if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
+		return -ENOMEM;
+
+	return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
+}
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
@@ -836,42 +925,30 @@ out_clean:
 
 EXPORT_SYMBOL_GPL(dccp_feat_clone);
 
-static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat,
-			    u8 *val, u8 len)
-{
-	int rc = -ENOMEM;
-	u8 *copy = kmemdup(val, len, GFP_KERNEL);
-
-	if (copy != NULL) {
-		rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL);
-		if (rc)
-			kfree(copy);
-	}
-	return rc;
-}
-
-int dccp_feat_init(struct dccp_minisock *dmsk)
+int dccp_feat_init(struct sock *sk)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	int rc;
 
-	INIT_LIST_HEAD(&dmsk->dccpms_pending);
-	INIT_LIST_HEAD(&dmsk->dccpms_conf);
+	INIT_LIST_HEAD(&dmsk->dccpms_pending);	/* XXX no longer used */
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);	/* XXX no longer used */
 
 	/* CCID L */
-	rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID,
-			      &dmsk->dccpms_tx_ccid, 1);
+	rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0,
+				&dmsk->dccpms_tx_ccid, 1);
 	if (rc)
 		goto out;
 
 	/* CCID R */
-	rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID,
-			      &dmsk->dccpms_rx_ccid, 1);
+	rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0,
+				&dmsk->dccpms_rx_ccid, 1);
 	if (rc)
 		goto out;
 
 	/* Ack ratio */
-	rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
-			      &dmsk->dccpms_ack_ratio, 1);
+	rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0,
+				dmsk->dccpms_ack_ratio);
 out:
 	return rc;
 }
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 7e953fd0a79..9eefdb43783 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -14,6 +14,15 @@
 #include <linux/types.h>
 #include "dccp.h"
 
+/*
+ * Known limit values
+ */
+/* Ack Ratio takes 2-byte integer values (11.3) */
+#define DCCPF_ACK_RATIO_MAX	0xFFFF
+/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
+#define DCCPF_SEQ_WMIN		32
+#define DCCPF_SEQ_WMAX		0x3FFFFFFFFFFFull
+
 enum dccp_feat_type {
 	FEAT_AT_RX   = 1,	/* located at RX side of half-connection  */
 	FEAT_AT_TX   = 2,	/* located at TX side of half-connection  */
@@ -74,6 +83,20 @@ static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
 	return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
 }
 
+/**
+ * struct ccid_dependency  -  Track changes resulting from choosing a CCID
+ * @dependent_feat: one of %dccp_feature_numbers
+ * @is_local: local (1) or remote (0) @dependent_feat
+ * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
+ * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
+ */
+struct ccid_dependency {
+	u8	dependent_feat;
+	bool	is_local:1,
+		is_mandatory:1;
+	u8	val;
+};
+
 #ifdef CONFIG_IP_DCCP_DEBUG
 extern const char *dccp_feat_typename(const u8 type);
 extern const char *dccp_feat_name(const u8 feat);
@@ -96,6 +119,6 @@ extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 extern void dccp_feat_clean(struct dccp_minisock *dmsk);
 extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
-extern int  dccp_feat_init(struct dccp_minisock *dmsk);
+extern int  dccp_feat_init(struct sock *sk);
 
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index dafcefd8659..01332fe7a99 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -202,7 +202,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	 * setsockopt(CCIDs-I-want/accept). -acme
 	 */
 	if (likely(ctl_sock_initialized)) {
-		int rc = dccp_feat_init(dmsk);
+		int rc = dccp_feat_init(sk);
 
 		if (rc)
 			return rc;
-- 
cgit v1.2.3


From 71bb49596bbf4e5a3328e1704d18604e822ba181 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Query supported CCIDs

This provides a data structure to record which CCIDs are locally supported
and three accessor functions:
 - a test function for internal use which is used to validate CCID requests
   made by the user;
 - a copy function so that the list can be used for feature-negotiation;
 - documented getsockopt() support so that the user can query capabilities.

The data structure is a table which is filled in at compile-time with the
list of available CCIDs (which in turn depends on the Kconfig choices).

Using the copy function for cloning the list of supported CCIDs is useful for
feature negotiation, since the negotiation is now with the full list of available
CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation
will not fail if the peer requests to use CCID3 instead of CCID2.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ccid.c  | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/ccid.h  |  5 +++++
 net/dccp/feat.c  |  4 ++++
 net/dccp/proto.c |  2 ++
 4 files changed, 59 insertions(+)

(limited to 'net')

diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 4809753d12a..f72ca83df55 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -13,6 +13,13 @@
 
 #include "ccid.h"
 
+static u8 builtin_ccids[] = {
+	DCCPC_CCID2,		/* CCID2 is supported by default */
+#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
+	DCCPC_CCID3,
+#endif
+};
+
 static struct ccid_operations *ccids[CCID_MAX];
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 static atomic_t ccids_lockct = ATOMIC_INIT(0);
@@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
 	}
 }
 
+/* check that up to @array_len members in @ccid_array are supported */
+bool ccid_support_check(u8 const *ccid_array, u8 array_len)
+{
+	u8 i, j, found;
+
+	for (i = 0, found = 0; i < array_len; i++, found = 0) {
+		for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++)
+			found = (ccid_array[i] == builtin_ccids[j]);
+		if (!found)
+			return false;
+	}
+	return true;
+}
+
+/**
+ * ccid_get_builtin_ccids  -  Provide copy of `builtin' CCID array
+ * @ccid_array: pointer to copy into
+ * @array_len: value to return length into
+ * This function allocates memory - caller must see that it is freed after use.
+ */
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
+{
+	*ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any());
+	if (*ccid_array == NULL)
+		return -ENOBUFS;
+	*array_len = ARRAY_SIZE(builtin_ccids);
+	return 0;
+}
+
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	if (len < sizeof(builtin_ccids))
+		return -EINVAL;
+
+	if (put_user(sizeof(builtin_ccids), optlen) ||
+	    copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids)))
+		return -EFAULT;
+	return 0;
+}
+
 int ccid_register(struct ccid_operations *ccid_ops)
 {
 	int err = -ENOBUFS;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index fdeae7b5731..259f5469d7d 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -103,6 +103,11 @@ static inline void *ccid_priv(const struct ccid *ccid)
 	return (void *)ccid->ccid_priv;
 }
 
+extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+extern int  ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+					  char __user *, int __user *);
+
 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
 			     gfp_t gfp);
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index b859722fba7..9399554878c 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -383,6 +383,10 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
 	    !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
 		return -EINVAL;
 
+	/* Avoid negotiating alien CCIDs by only advertising supported ones */
+	if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
+		return -EOPNOTSUPP;
+
 	if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
 		return -ENOMEM;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 01332fe7a99..b4b10cbd888 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -649,6 +649,8 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_GET_CUR_MPS:
 		val = dp->dccps_mss_cache;
 		break;
+	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		val = dp->dccps_server_timewait;
 		break;
-- 
cgit v1.2.3


From 093e1f46cf162913d05e1d4eeb01baa3e297b683 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Resolve dependencies of features on choice of CCID

This provides a missing link in the code chain, as several features implicitly
depend and/or rely on the choice of CCID. Most notably, this is the Send Ack Vector
feature, but also Ack Ratio and Send Loss Event Rate (also taken care of).

For Send Ack Vector, the situation is as follows:
 * since CCID2 mandates the use of Ack Vectors, there is no point in allowing
   endpoints which use CCID2 to disable Ack Vector features such a connection;

 * a peer with a TX CCID of CCID2 will always expect Ack Vectors, and a peer
   with a RX CCID of CCID2 must always send Ack Vectors (RFC 4341, sec. 4);

 * for all other CCIDs, the use of (Send) Ack Vector is optional and thus
   negotiable. However, this implies that the code negotiating the use of Ack
   Vectors also supports it (i.e. is able to supply and to either parse or
   ignore received Ack Vectors). Since this is not the case (CCID-3 has no Ack
   Vector support), the use of Ack Vectors is here disabled, with a comment
   in the source code.

An analogous consideration arises for the Send Loss Event Rate feature,
since the CCID-3 implementation does not support the loss interval options
of RFC 4342. To make such use explicit, corresponding feature-negotiation
options are inserted which signal the use of the loss event rate option,
as it is used by the CCID3 code.

Lastly, the values of the Ack Ratio feature are matched to the choice of CCID.

The patch implements this as a function which is called after the user has
made all other registrations for changing default values of features.

The table is variable-length, the reserved (and hence for feature-negotiation
invalid, confirmed by considering section 19.4 of RFC 4340) feature number `0'
is used to mark the end of the table.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h   |   1 +
 net/dccp/feat.c   | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/output.c |   4 ++
 net/dccp/proto.c  |   3 +
 4 files changed, 168 insertions(+)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index dee4a90886d..1881527bdcd 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -442,6 +442,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
 	       inet_csk_ack_scheduled(sk);
 }
 
+extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
 extern void dccp_feat_list_purge(struct list_head *fn_list);
 
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 9399554878c..ed9f50b1c34 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -441,6 +441,166 @@ int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 
 EXPORT_SYMBOL_GPL(dccp_feat_change);
 
+/*
+ *	Tracking features whose value depend on the choice of CCID
+ *
+ * This is designed with an extension in mind so that a list walk could be done
+ * before activating any features. However, the existing framework was found to
+ * work satisfactorily up until now, the automatic verification is left open.
+ * When adding new CCIDs, add a corresponding dependency table here.
+ */
+static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
+{
+	static const struct ccid_dependency ccid2_dependencies[2][2] = {
+		/*
+		 * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
+		 * feature and Send Ack Vector is an RX feature, `is_local'
+		 * needs to be reversed.
+		 */
+		{	/* Dependencies of the receiver-side (remote) CCID2 */
+			{
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= true,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 }
+		},
+		{	/* Dependencies of the sender-side (local) CCID2 */
+			{
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= false,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 }
+		}
+	};
+	static const struct ccid_dependency ccid3_dependencies[2][5] = {
+		{	/*
+			 * Dependencies of the receiver-side CCID3
+			 */
+			{	/* locally disable Ack Vectors */
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= true,
+				.is_mandatory	= false,
+				.val		= 0
+			},
+			{	/* see below why Send Loss Event Rate is on */
+				.dependent_feat	= DCCPF_SEND_LEV_RATE,
+				.is_local	= true,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{	/* NDP Count is needed as per RFC 4342, 6.1.1 */
+				.dependent_feat	= DCCPF_SEND_NDP_COUNT,
+				.is_local	= false,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 },
+		},
+		{	/*
+			 * CCID3 at the TX side: we request that the HC-receiver
+			 * will not send Ack Vectors (they will be ignored, so
+			 * Mandatory is not set); we enable Send Loss Event Rate
+			 * (Mandatory since the implementation does not support
+			 * the Loss Intervals option of RFC 4342, 8.6).
+			 * The last two options are for peer's information only.
+			*/
+			{
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= false,
+				.is_mandatory	= false,
+				.val		= 0
+			},
+			{
+				.dependent_feat	= DCCPF_SEND_LEV_RATE,
+				.is_local	= false,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{	/* this CCID does not support Ack Ratio */
+				.dependent_feat	= DCCPF_ACK_RATIO,
+				.is_local	= true,
+				.is_mandatory	= false,
+				.val		= 0
+			},
+			{	/* tell receiver we are sending NDP counts */
+				.dependent_feat	= DCCPF_SEND_NDP_COUNT,
+				.is_local	= true,
+				.is_mandatory	= false,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 }
+		}
+	};
+	switch (ccid) {
+	case DCCPC_CCID2:
+		return ccid2_dependencies[is_local];
+	case DCCPC_CCID3:
+		return ccid3_dependencies[is_local];
+	default:
+		return NULL;
+	}
+}
+
+/**
+ * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
+ * @fn: feature-negotiation list to update
+ * @id: CCID number to track
+ * @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ * This function needs to be called after registering all other features.
+ */
+static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
+{
+	const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
+	int i, rc = (table == NULL);
+
+	for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
+		if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
+			rc = __feat_register_sp(fn, table[i].dependent_feat,
+						    table[i].is_local,
+						    table[i].is_mandatory,
+						    &table[i].val, 1);
+		else
+			rc = __feat_register_nn(fn, table[i].dependent_feat,
+						    table[i].is_mandatory,
+						    table[i].val);
+	return rc;
+}
+
+/**
+ * dccp_feat_finalise_settings  -  Finalise settings before starting negotiation
+ * @dp: client or listening socket (settings will be inherited)
+ * This is called after all registrations (socket initialisation, sysctls, and
+ * sockopt calls), and before sending the first packet containing Change options
+ * (ie. client-Request or server-Response), to ensure internal consistency.
+ */
+int dccp_feat_finalise_settings(struct dccp_sock *dp)
+{
+	struct list_head *fn = &dp->dccps_featneg;
+	struct dccp_feat_entry *entry;
+	int i = 2, ccids[2] = { -1, -1 };
+
+	/*
+	 * Propagating CCIDs:
+	 * 1) not useful to propagate CCID settings if this host advertises more
+	 *    than one CCID: the choice of CCID  may still change - if this is
+	 *    the client, or if this is the server and the client sends
+	 *    singleton CCID values.
+	 * 2) since is that propagate_ccid changes the list, we defer changing
+	 *    the sorted list until after the traversal.
+	 */
+	list_for_each_entry(entry, fn, node)
+		if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
+			ccids[entry->is_local] = entry->val.sp.vec[0];
+	while (i--)
+		if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
+			return -1;
+	return 0;
+}
+
 static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index d06945c7d3d..dc96ecfbe6e 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -469,6 +469,10 @@ int dccp_connect(struct sock *sk)
 	struct sk_buff *skb;
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	/* do not connect if feature negotiation setup fails */
+	if (dccp_feat_finalise_settings(dccp_sk(sk)))
+		return -EPROTO;
+
 	dccp_connect_init(sk);
 
 	skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index b4b10cbd888..46cb3490d48 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -278,6 +278,9 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
+	/* do not start to listen if feature negotiation setup fails */
+	if (dccp_feat_finalise_settings(dp))
+		return -EPROTO;
 	return inet_csk_listen_start(sk, backlog);
 }
 
-- 
cgit v1.2.3


From d4c8741c431e07cfc66eb2b4c3a17b8d4975d9c0 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Mechanism to resolve CCID dependencies

This adds a hook to resolve features whose value depends on the choice of
CCID. It is done at the server since it can only be done after the CCID
values have been negotiated; i.e. the client will add its CCID preference
list on the Change options sent in the Request, which will be reconciled
with the local preference list of the server.

The concept is documented on
http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/feature_negotiation/\
				implementation_notes.html#ccid_dependencies

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h   |  1 +
 net/dccp/feat.c   | 25 +++++++++++++++++++++++++
 net/dccp/output.c | 13 +++++++++----
 3 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 1881527bdcd..e656dafb5d9 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -443,6 +443,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
 }
 
 extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
+extern int  dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
 extern void dccp_feat_list_purge(struct list_head *fn_list);
 
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index ed9f50b1c34..6852960bb0a 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -601,6 +601,31 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp)
 	return 0;
 }
 
+/**
+ * dccp_feat_server_ccid_dependencies  -  Resolve CCID-dependent features
+ * It is the server which resolves the dependencies once the CCID has been
+ * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
+ */
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
+{
+	struct list_head *fn = &dreq->dreq_featneg;
+	struct dccp_feat_entry *entry;
+	u8 is_local, ccid;
+
+	for (is_local = 0; is_local <= 1; is_local++) {
+		entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
+
+		if (entry != NULL && !entry->empty_confirm)
+			ccid = entry->val.sp.vec[0];
+		else
+			ccid = dccp_feat_default_value(DCCPF_CCID);
+
+		if (dccp_feat_propagate_ccid(fn, ccid, is_local))
+			return -1;
+	}
+	return 0;
+}
+
 static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index dc96ecfbe6e..19a93d5433a 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -339,10 +339,12 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
 	DCCP_SKB_CB(skb)->dccpd_seq  = dreq->dreq_iss;
 
-	if (dccp_insert_options_rsk(dreq, skb)) {
-		kfree_skb(skb);
-		return NULL;
-	}
+	/* Resolve feature dependencies resulting from choice of CCID */
+	if (dccp_feat_server_ccid_dependencies(dreq))
+		goto response_failed;
+
+	if (dccp_insert_options_rsk(dreq, skb))
+		goto response_failed;
 
 	/* Build and checksum header */
 	dh = dccp_zeroed_hdr(skb, dccp_header_size);
@@ -363,6 +365,9 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	inet_rsk(req)->acked = 1;
 	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
 	return skb;
+response_failed:
+	kfree_skb(skb);
+	return NULL;
 }
 
 EXPORT_SYMBOL_GPL(dccp_make_response);
-- 
cgit v1.2.3


From 668144f7b41716a9efe1b398e15ead32a26cd101 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Deprecate old setsockopt framework

The previous setsockopt interface, which passed socket options via struct
dccp_so_feat, is complicated/difficult to use. Continuing to support it leads to
ugly code since the old approach did not distinguish between NN and SP values.

This patch removes the old setsockopt interface and replaces it with two new
functions to register NN/SP values for feature negotiation. These are
essentially wrappers around the internal __feat_register functions, with
checking added to avoid
 * wrong usage (type);
 * changing values while the connection is in progress.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/feat.c  | 72 +++++++++++++++++++++-----------------------------------
 net/dccp/feat.h  |  5 ++--
 net/dccp/proto.c | 53 ++---------------------------------------
 3 files changed, 32 insertions(+), 98 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 6852960bb0a..44b10afd3fb 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -393,53 +393,35 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
 	return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
 }
 
-int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
-		     u8 *val, u8 len, gfp_t gfp)
-{
-	struct dccp_opt_pend *opt;
-
-	dccp_feat_debug(type, feature, *val);
-
-	if (len > 3) {
-		DCCP_WARN("invalid length %d\n", len);
+/**
+ * dccp_feat_register_sp  -  Register requests to change SP feature values
+ * @sk: client or listening socket
+ * @feat: one of %dccp_feature_numbers
+ * @is_local: whether the local (1) or remote (0) @feat is meant
+ * @list: array of preferred values, in descending order of preference
+ * @len: length of @list in bytes
+ */
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+			  u8 const *list, u8 len)
+{	 /* any changes must be registered before establishing the connection */
+	if (sk->sk_state != DCCP_CLOSED)
+		return -EISCONN;
+	if (dccp_feat_type(feat) != FEAT_SP)
 		return -EINVAL;
-	}
-	/* XXX add further sanity checks */
-
-	/* check if that feature is already being negotiated */
-	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
-		/* ok we found a negotiation for this option already */
-		if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
-			dccp_pr_debug("Replacing old\n");
-			/* replace */
-			BUG_ON(opt->dccpop_val == NULL);
-			kfree(opt->dccpop_val);
-			opt->dccpop_val	 = val;
-			opt->dccpop_len	 = len;
-			opt->dccpop_conf = 0;
-			return 0;
-		}
-	}
-
-	/* negotiation for a new feature */
-	opt = kmalloc(sizeof(*opt), gfp);
-	if (opt == NULL)
-		return -ENOMEM;
-
-	opt->dccpop_type = type;
-	opt->dccpop_feat = feature;
-	opt->dccpop_len	 = len;
-	opt->dccpop_val	 = val;
-	opt->dccpop_conf = 0;
-	opt->dccpop_sc	 = NULL;
-
-	BUG_ON(opt->dccpop_val == NULL);
-
-	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
-	return 0;
+	return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
+				  0, list, len);
 }
 
-EXPORT_SYMBOL_GPL(dccp_feat_change);
+/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
+int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
+{
+	/* any changes must be registered before establishing the connection */
+	if (sk->sk_state != DCCP_CLOSED)
+		return -EISCONN;
+	if (dccp_feat_type(feat) != FEAT_NN)
+		return -EINVAL;
+	return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
+}
 
 /*
  *	Tracking features whose value depend on the choice of CCID
@@ -1137,7 +1119,7 @@ int dccp_feat_init(struct sock *sk)
 
 	/* Ack ratio */
 	rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0,
-				dmsk->dccpms_ack_ratio);
+				dp->dccps_l_ack_ratio);
 out:
 	return rc;
 }
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 9eefdb43783..2c92bd18e5f 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -110,8 +110,9 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
 #define dccp_feat_debug(type, feat, val)
 #endif /* CONFIG_IP_DCCP_DEBUG */
 
-extern int  dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
-			     u8 *val, u8 len, gfp_t gfp);
+extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+				  u8 const *list, u8 len);
+extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
 				  u8 *val, u8 len);
 extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 46cb3490d48..108d56bd25c 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -470,44 +470,6 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
 	return 0;
 }
 
-/* byte 1 is feature.  the rest is the preference list */
-static int dccp_setsockopt_change(struct sock *sk, int type,
-				  struct dccp_so_feat __user *optval)
-{
-	struct dccp_so_feat opt;
-	u8 *val;
-	int rc;
-
-	if (copy_from_user(&opt, optval, sizeof(opt)))
-		return -EFAULT;
-	/*
-	 * rfc4340: 6.1. Change Options
-	 */
-	if (opt.dccpsf_len < 1)
-		return -EINVAL;
-
-	val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
-	if (!val)
-		return -ENOMEM;
-
-	if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
-		rc = -EFAULT;
-		goto out_free_val;
-	}
-
-	rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat,
-			      val, opt.dccpsf_len, GFP_KERNEL);
-	if (rc)
-		goto out_free_val;
-
-out:
-	return rc;
-
-out_free_val:
-	kfree(val);
-	goto out;
-}
-
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		char __user *optval, int optlen)
 {
@@ -530,20 +492,9 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		err = 0;
 		break;
 	case DCCP_SOCKOPT_CHANGE_L:
-		if (optlen != sizeof(struct dccp_so_feat))
-			err = -EINVAL;
-		else
-			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
-						     (struct dccp_so_feat __user *)
-						     optval);
-		break;
 	case DCCP_SOCKOPT_CHANGE_R:
-		if (optlen != sizeof(struct dccp_so_feat))
-			err = -EINVAL;
-		else
-			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
-						     (struct dccp_so_feat __user *)
-						     optval);
+		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+		err = 0;
 		break;
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		if (dp->dccps_role != DCCP_ROLE_SERVER)
-- 
cgit v1.2.3


From 20f41eee82864e308a5499308a1722dc3181cc3a Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Feature negotiation for minimum-checksum-coverage

This provides feature negotiation for server minimum checksum coverage
which so far has been missing.

Since sender/receiver coverage values range only from 0...15, their
type has also been reduced in size from u16 to u4.

Feature-negotiation options are now generated for both sender and receiver
coverage, i.e. when the peer has `forgotten' to enable partial coverage
then feature negotiation will automatically enable (negotiate) the partial
coverage value for this connection.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/proto.c | 53 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 108d56bd25c..47b137a3fea 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -470,6 +470,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
 	return 0;
 }
 
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
+{
+	u8 *list, len;
+	int i, rc;
+
+	if (cscov < 0 || cscov > 15)
+		return -EINVAL;
+	/*
+	 * Populate a list of permissible values, in the range cscov...15. This
+	 * is necessary since feature negotiation of single values only works if
+	 * both sides incidentally choose the same value. Since the list starts
+	 * lowest-value first, negotiation will pick the smallest shared value.
+	 */
+	if (cscov == 0)
+		return 0;
+	len = 16 - cscov;
+
+	list = kmalloc(len, GFP_KERNEL);
+	if (list == NULL)
+		return -ENOBUFS;
+
+	for (i = 0; i < len; i++)
+		list[i] = cscov++;
+
+	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
+
+	if (rc == 0) {
+		if (rx)
+			dccp_sk(sk)->dccps_pcrlen = cscov;
+		else
+			dccp_sk(sk)->dccps_pcslen = cscov;
+	}
+	kfree(list);
+	return rc;
+}
+
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		char __user *optval, int optlen)
 {
@@ -502,20 +538,11 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		else
 			dp->dccps_server_timewait = (val != 0);
 		break;
-	case DCCP_SOCKOPT_SEND_CSCOV:	/* sender side, RFC 4340, sec. 9.2 */
-		if (val < 0 || val > 15)
-			err = -EINVAL;
-		else
-			dp->dccps_pcslen = val;
+	case DCCP_SOCKOPT_SEND_CSCOV:
+		err = dccp_setsockopt_cscov(sk, val, false);
 		break;
-	case DCCP_SOCKOPT_RECV_CSCOV:	/* receiver side, RFC 4340 sec. 9.2.1 */
-		if (val < 0 || val > 15)
-			err = -EINVAL;
-		else {
-			dp->dccps_pcrlen = val;
-			/* FIXME: add feature negotiation,
-			 * ChangeL(MinimumChecksumCoverage, val) */
-		}
+	case DCCP_SOCKOPT_RECV_CSCOV:
+		err = dccp_setsockopt_cscov(sk, val, true);
 		break;
 	default:
 		err = -ENOPROTOOPT;
-- 
cgit v1.2.3


From 17c30b40ed79e9f3955e884632c8f01e577b204a Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Deprecate Ack Ratio sysctl

This patch deprecates the Ack Ratio sysctl, since
 * Ack Ratio is entirely ignored by CCID-3 and CCID-4,
 * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1);
 * even if it would work in CCID-2, there is no point for a user to change it:
   - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2),
   - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts
     (since waiting for Acks which will never arrive in this window),
   - cwnd is not a user-configurable value.

The only reasonable place for Ack Ratio is to print it for debugging. It is
planned to do this later on, as part of e.g. dccp_probe.

With this patch Ack Ratio is now under full control of feature negotiation:
 * Ack Ratio is resolved as a dependency of the selected CCID;
 * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to
   the default of 2, following RFC 4340, 11.3 - "New connections start with Ack
   Ratio 2 for both endpoints";
 * what happens then is part of another patch set, since it concerns the
   dynamic update of Ack Ratio while the connection is in full flight.

Thanks to Tomasz Grobelny for discussion leading up to this patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 net/dccp/dccp.h      | 1 -
 net/dccp/minisocks.c | 1 -
 net/dccp/options.c   | 1 -
 net/dccp/sysctl.c    | 7 -------
 4 files changed, 10 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index e656dafb5d9..031ce350d3c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -98,7 +98,6 @@ extern int  sysctl_dccp_retries2;
 extern int  sysctl_dccp_feat_sequence_window;
 extern int  sysctl_dccp_feat_rx_ccid;
 extern int  sysctl_dccp_feat_tx_ccid;
-extern int  sysctl_dccp_feat_ack_ratio;
 extern int  sysctl_dccp_feat_send_ack_vector;
 extern int  sysctl_dccp_feat_send_ndp_count;
 extern int  sysctl_dccp_tx_qlen;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index e487133ae07..ee7f40f8ddf 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -47,7 +47,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk)
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
 	dmsk->dccpms_rx_ccid	     = sysctl_dccp_feat_rx_ccid;
 	dmsk->dccpms_tx_ccid	     = sysctl_dccp_feat_tx_ccid;
-	dmsk->dccpms_ack_ratio	     = sysctl_dccp_feat_ack_ratio;
 	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
 	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
 }
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 67a171a1268..515ad45013a 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -26,7 +26,6 @@
 int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
 int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_ack_ratio	      = DCCPF_INITIAL_ACK_RATIO;
 int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
 int sysctl_dccp_feat_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 21295993fdb..f6e54f433e2 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "ack_ratio",
-		.data		= &sysctl_dccp_feat_ack_ratio,
-		.maxlen		= sizeof(sysctl_dccp_feat_ack_ratio),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "send_ackvec",
 		.data		= &sysctl_dccp_feat_send_ack_vector,
-- 
cgit v1.2.3


From 73bbe095bbb9ce5f94d5475bad54c7ccd8573b1b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Tidy up setsockopt calls

This splits the setsockopt calls into two groups, depending on whether an
integer argument (val) is required and whether routines being called do
their own locking.

Some options (such as setting the CCID) use u8 rather than int, so that for
these the test with regard to integer-sizeof can not be used.

The second switch-case statement now only has those statements which need
locking and which make use of `val'.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reviewed-by: Eugene Teo <eugeneteo@kernel.sg>
---
 net/dccp/proto.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 47b137a3fea..e29bbf91405 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -512,7 +512,17 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 	struct dccp_sock *dp = dccp_sk(sk);
 	int val, err = 0;
 
-	if (optlen < sizeof(int))
+	switch (optname) {
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+		return 0;
+	case DCCP_SOCKOPT_CHANGE_L:
+	case DCCP_SOCKOPT_CHANGE_R:
+		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+		return 0;
+	}
+
+	if (optlen < (int)sizeof(int))
 		return -EINVAL;
 
 	if (get_user(val, (int __user *)optval))
@@ -523,15 +533,6 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 
 	lock_sock(sk);
 	switch (optname) {
-	case DCCP_SOCKOPT_PACKET_SIZE:
-		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
-		err = 0;
-		break;
-	case DCCP_SOCKOPT_CHANGE_L:
-	case DCCP_SOCKOPT_CHANGE_R:
-		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
-		err = 0;
-		break;
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		if (dp->dccps_role != DCCP_ROLE_SERVER)
 			err = -EOPNOTSUPP;
@@ -548,8 +549,8 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		err = -ENOPROTOOPT;
 		break;
 	}
-
 	release_sock(sk);
+
 	return err;
 }
 
-- 
cgit v1.2.3


From fade756f18d42694e3acb00e3471ab43002cba16 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Set per-connection CCIDs via socket options

With this patch, TX/RX CCIDs can now be changed on a per-connection basis, which
overrides the defaults set by the global sysctl variables for TX/RX CCIDs.

To make full use of this facility, the remaining patches of this patch set are
needed, which track dependencies and activate negotiated feature values.

Note on the maximum number of CCIDs that can be registered:
-----------------------------------------------------------
The maximum number of CCIDs that can be registered on the socket is constrained
by the space in a Confirm/Change feature negotiation option.

The space in these in turn depends on the size of header options as defined
in RFC 4340, 5.8. Since this is a recurring constant, it has been moved from
ackvec.h into linux/dccp.h, clarifying its purpose.

Relative to this size, the maximum number of CCID identifiers that can be
present in a Confirm option (which always consumes 1 byte more than a Change
option, cf. 6.1) is 2 bytes less than the maximum TLV size: one for the
CCID-feature-type and one for the selected value.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c |  9 ++++-----
 net/dccp/ackvec.h |  5 ++---
 net/dccp/feat.h   |  2 ++
 net/dccp/proto.c  | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 42 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 1e8be246ad1..01e4d39fa23 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -12,7 +12,6 @@
 #include "ackvec.h"
 #include "dccp.h"
 
-#include <linux/dccp.h>
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
@@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
 	/* Figure out how many options do we need to represent the ackvec */
-	const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
+	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
 	u16 len = av->av_vec_len + 2 * nr_opts, i;
 	u32 elapsed_time;
 	const unsigned char *tail, *from;
@@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	for (i = 0; i < nr_opts; ++i) {
 		int copylen = len;
 
-		if (len > DCCP_MAX_ACKVEC_OPT_LEN)
-			copylen = DCCP_MAX_ACKVEC_OPT_LEN;
+		if (len > DCCP_SINGLE_OPT_MAXLEN)
+			copylen = DCCP_SINGLE_OPT_MAXLEN;
 
 		*to++ = DCCPO_ACK_VECTOR_0;
 		*to++ = copylen + 2;
@@ -432,7 +431,7 @@ found:
 int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
 {
-	if (len > DCCP_MAX_ACKVEC_OPT_LEN)
+	if (len > DCCP_SINGLE_OPT_MAXLEN)
 		return -1;
 
 	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index bcb64fb4ace..4ccee030524 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -11,15 +11,14 @@
  *	published by the Free Software Foundation.
  */
 
+#include <linux/dccp.h>
 #include <linux/compiler.h>
 #include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/types.h>
 
-/* Read about the ECN nonce to see why it is 253 */
-#define DCCP_MAX_ACKVEC_OPT_LEN 253
 /* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
+#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
 
 #define DCCP_ACKVEC_STATE_RECEIVED	0
 #define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 2c92bd18e5f..b53b11717c4 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -22,6 +22,8 @@
 /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
 #define DCCPF_SEQ_WMIN		32
 #define DCCPF_SEQ_WMAX		0x3FFFFFFFFFFFull
+/* Maximum number of SP values that fit in a single (Confirm) option */
+#define DCCP_FEAT_MAX_SP_VALS	(DCCP_SINGLE_OPT_MAXLEN - 2)
 
 enum dccp_feat_type {
 	FEAT_AT_RX   = 1,	/* located at RX side of half-connection  */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e29bbf91405..2cd56df44d8 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -506,6 +506,36 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
 	return rc;
 }
 
+static int dccp_setsockopt_ccid(struct sock *sk, int type,
+				char __user *optval, int optlen)
+{
+	u8 *val;
+	int rc = 0;
+
+	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+		return -EINVAL;
+
+	val = kmalloc(optlen, GFP_KERNEL);
+	if (val == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(val, optval, optlen)) {
+		kfree(val);
+		return -EFAULT;
+	}
+
+	lock_sock(sk);
+	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
+		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
+
+	if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
+		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
+	release_sock(sk);
+
+	kfree(val);
+	return rc;
+}
+
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		char __user *optval, int optlen)
 {
@@ -520,6 +550,10 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_CHANGE_R:
 		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
 		return 0;
+	case DCCP_SOCKOPT_CCID:
+	case DCCP_SOCKOPT_RX_CCID:
+	case DCCP_SOCKOPT_TX_CCID:
+		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
 	}
 
 	if (optlen < (int)sizeof(int))
-- 
cgit v1.2.3


From c8041e264b3db6944d37b87969fbe6458cb30cfd Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: API to query the current TX/RX CCID

This provides function to query the current TX/RX CCID dynamically, without
reliance on the minisock value, using dynamic information available in the
currently loaded CCID module.

This query function is then used to
 (a) provide the getsockopt part for getting/setting CCIDs via sockopts;
 (b) replace the current test for "which CCID is in use" in probe.c.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ccid.h  | 18 ++++++++++++++++++
 net/dccp/probe.c |  7 ++-----
 net/dccp/proto.c | 10 ++++++++++
 3 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 259f5469d7d..803343aed00 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -116,6 +116,24 @@ extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
 extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
 				   gfp_t gfp);
 
+static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
+{
+	struct ccid *ccid = dp->dccps_hc_rx_ccid;
+
+	if (ccid == NULL || ccid->ccid_ops == NULL)
+		return -1;
+	return ccid->ccid_ops->ccid_id;
+}
+
+static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
+{
+	struct ccid *ccid = dp->dccps_hc_tx_ccid;
+
+	if (ccid == NULL || ccid->ccid_ops == NULL)
+		return -1;
+	return ccid->ccid_ops->ccid_id;
+}
+
 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
 
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 81368a7f537..9ca783d7467 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -74,14 +74,11 @@ static void printl(const char *fmt, ...)
 static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
 			 struct msghdr *msg, size_t size)
 {
-	const struct dccp_minisock *dmsk = dccp_msk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
-	const struct ccid3_hc_tx_sock *hctx;
+	struct ccid3_hc_tx_sock *hctx = NULL;
 
-	if (dmsk->dccpms_tx_ccid == DCCPC_CCID3)
+	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
 		hctx = ccid3_hc_tx_sk(sk);
-	else
-		hctx = NULL;
 
 	if (port == 0 || ntohs(inet->dport) == port ||
 	    ntohs(inet->sport) == port) {
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2cd56df44d8..6550452d59e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -667,6 +667,16 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 		break;
 	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
 		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
+	case DCCP_SOCKOPT_TX_CCID:
+		val = ccid_get_current_tx_ccid(dp);
+		if (val < 0)
+			return -ENOPROTOOPT;
+		break;
+	case DCCP_SOCKOPT_RX_CCID:
+		val = ccid_get_current_rx_ccid(dp);
+		if (val < 0)
+			return -ENOPROTOOPT;
+		break;
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		val = dp->dccps_server_timewait;
 		break;
-- 
cgit v1.2.3


From b9aaac1c538a9c86e8ef3be2579a13ff55580908 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Increase the scope of variable-length htonl/ntohl functions

This extends the scope of two available functions, encode|decode_value_var,
to work up to 6 (8) bytes, to match maximum requirements in the RFC.

These functions are going to be used both by general option processing and
feature negotiation code, hence declarations have been put into feat.h.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 net/dccp/feat.h    | 14 ++++++++++++++
 net/dccp/options.c | 21 ++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index b53b11717c4..90d16caf545 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -124,4 +124,18 @@ extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
 extern int  dccp_feat_init(struct sock *sk);
 
+/*
+ * Encoding variable-length options and their maximum length.
+ *
+ * This affects NN options (SP options are all u8) and other variable-length
+ * options (see table 3 in RFC 4340). The limit is currently given the Sequence
+ * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
+ * options consume less than 6 bytes (timestamps are 4 bytes).
+ * When updating this constant (e.g. due to new internet drafts / RFCs), make
+ * sure that you also update all code which refers to it.
+ */
+#define DCCP_OPTVAL_MAXLEN	6
+
+extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
+extern u64  dccp_decode_value_var(const u8 *bf, const u8 len);
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 515ad45013a..9cb0ff89405 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -29,16 +29,20 @@ int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
 int sysctl_dccp_feat_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
-static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
+u64 dccp_decode_value_var(const u8 *bf, const u8 len)
 {
-	u32 value = 0;
+	u64 value = 0;
 
+	if (len >= DCCP_OPTVAL_MAXLEN)
+		value += ((u64)*bf++) << 40;
+	if (len > 4)
+		value += ((u64)*bf++) << 32;
 	if (len > 3)
-		value += *bf++ << 24;
+		value += ((u64)*bf++) << 24;
 	if (len > 2)
-		value += *bf++ << 16;
+		value += ((u64)*bf++) << 16;
 	if (len > 1)
-		value += *bf++ << 8;
+		value += ((u64)*bf++) << 8;
 	if (len > 0)
 		value += *bf;
 
@@ -298,9 +302,12 @@ out_invalid_option:
 
 EXPORT_SYMBOL_GPL(dccp_parse_options);
 
-static void dccp_encode_value_var(const u32 value, unsigned char *to,
-				  const unsigned int len)
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
 {
+	if (len >= DCCP_OPTVAL_MAXLEN)
+		*to++ = (value & 0xFF0000000000ull) >> 40;
+	if (len > 4)
+		*to++ = (value & 0xFF00000000ull) >> 32;
 	if (len > 3)
 		*to++ = (value & 0xFF000000) >> 24;
 	if (len > 2)
-- 
cgit v1.2.3


From d0440ee6f6903fcde6ed4efb88c910de1dfa18e5 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Support for Mandatory options

Support for Mandatory options is provided by this patch, which will
be used by subsequent feature-negotiation patches.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 net/dccp/feat.h    |  2 ++
 net/dccp/options.c | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 90d16caf545..602e0a7294b 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -138,4 +138,6 @@ extern int  dccp_feat_init(struct sock *sk);
 
 extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
 extern u64  dccp_decode_value_var(const u8 *bf, const u8 len);
+
+extern int  dccp_insert_option_mandatory(struct sk_buff *skb);
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 9cb0ff89405..676d53065de 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -467,6 +467,21 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
 	return 0;
 }
 
+/**
+ * dccp_insert_option_mandatory  -  Mandatory option (5.8.2)
+ * Note that since we are using skb_push, this function needs to be called
+ * _after_ inserting the option it is supposed to influence (stack order).
+ */
+int dccp_insert_option_mandatory(struct sk_buff *skb)
+{
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len++;
+	*skb_push(skb, 1) = DCCPO_MANDATORY;
+	return 0;
+}
+
 static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
 				u8 *val, u8 len)
 {
-- 
cgit v1.2.3


From cf9ddf73b9ba21a5cd6d3fcb0a45cfa9ec452033 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Header option insertion routine for feature-negotiation

The patch extends existing code:
 * Confirm options divide into the confirmed value plus an optional preference
   list for SP values. Previously only the preference list was echoed for SP
   values, now the confirmed value is added as per RFC 4340, 6.1;
 * length and sanity checks are added to avoid illegal memory (or NULL) access.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.h    |  2 ++
 net/dccp/options.c | 91 +++++++++++++++++++-----------------------------------
 2 files changed, 33 insertions(+), 60 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 602e0a7294b..8e863839ed9 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -140,4 +140,6 @@ extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
 extern u64  dccp_decode_value_var(const u8 *bf, const u8 len);
 
 extern int  dccp_insert_option_mandatory(struct sk_buff *skb);
+extern int  dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+			       u8 *val, u8 len, bool repeat_first);
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 676d53065de..bfa1cb8f3ef 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -482,23 +482,46 @@ int dccp_insert_option_mandatory(struct sk_buff *skb)
 	return 0;
 }
 
-static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
-				u8 *val, u8 len)
+/**
+ * dccp_insert_fn_opt  -  Insert single Feature-Negotiation option into @skb
+ * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
+ * @feat: one out of %dccp_feature_numbers
+ * @val: NN value or SP array (preferred element first) to copy
+ * @len: true length of @val in bytes (excluding first element repetition)
+ * @repeat_first: whether to copy the first element of @val twice
+ * The last argument is used to construct Confirm options, where the preferred
+ * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
+ * lists are kept such that the preferred entry is always first, so we only need
+ * to copy twice, and avoid the overhead of cloning into a bigger array.
+ */
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+		       u8 *val, u8 len, bool repeat_first)
 {
-	u8 *to;
+	u8 tot_len, *to;
 
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) {
-		DCCP_WARN("packet too small for feature %d option!\n", feat);
+	/* take the `Feature' field and possible repetition into account */
+	if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
+		DCCP_WARN("length %u for feature %u too large\n", len, feat);
 		return -1;
 	}
 
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3;
+	if (unlikely(val == NULL || len == 0))
+		len = repeat_first = 0;
+	tot_len = 3 + repeat_first + len;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
+		DCCP_WARN("packet too small for feature %d option!\n", feat);
+		return -1;
+	}
+	DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
 
-	to    = skb_push(skb, len + 3);
+	to    = skb_push(skb, tot_len);
 	*to++ = type;
-	*to++ = len + 3;
+	*to++ = tot_len;
 	*to++ = feat;
 
+	if (repeat_first)
+		*to++ = *val;
 	if (len)
 		memcpy(to, val, len);
 
@@ -508,51 +531,6 @@ static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
 	return 0;
 }
 
-static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
-{
-	struct dccp_minisock *dmsk = dccp_msk(sk);
-	struct dccp_opt_pend *opt, *next;
-	int change = 0;
-
-	/* confirm any options [NN opts] */
-	list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
-		dccp_insert_feat_opt(skb, opt->dccpop_type,
-				     opt->dccpop_feat, opt->dccpop_val,
-				     opt->dccpop_len);
-		/* fear empty confirms */
-		if (opt->dccpop_val)
-			kfree(opt->dccpop_val);
-		kfree(opt);
-	}
-	INIT_LIST_HEAD(&dmsk->dccpms_conf);
-
-	/* see which features we need to send */
-	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
-		/* see if we need to send any confirm */
-		if (opt->dccpop_sc) {
-			dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
-					     opt->dccpop_feat,
-					     opt->dccpop_sc->dccpoc_val,
-					     opt->dccpop_sc->dccpoc_len);
-
-			BUG_ON(!opt->dccpop_sc->dccpoc_val);
-			kfree(opt->dccpop_sc->dccpoc_val);
-			kfree(opt->dccpop_sc);
-			opt->dccpop_sc = NULL;
-		}
-
-		/* any option not confirmed, re-send it */
-		if (!opt->dccpop_conf) {
-			dccp_insert_feat_opt(skb, opt->dccpop_type,
-					     opt->dccpop_feat, opt->dccpop_val,
-					     opt->dccpop_len);
-			change++;
-		}
-	}
-
-	return 0;
-}
-
 /* The length of all options needs to be a multiple of 4 (5.8) */
 static void dccp_insert_option_padding(struct sk_buff *skb)
 {
@@ -589,13 +567,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 		dp->dccps_hc_rx_insert_options = 0;
 	}
 
-	/* Feature negotiation */
-	/* Data packets can't do feat negotiation */
-	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
-	    DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK &&
-	    dccp_insert_options_feat(sk, skb))
-		return -1;
-
 	/*
 	 * Obtain RTT sample from Request/Response exchange.
 	 * This is currently used in CCID 3 initialisation.
-- 
cgit v1.2.3


From 0ef118a017919cd661cf294811d1889ac556ee80 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Insert feature-negotiation options into skb

This patch replaces the earlier insertion routine from options.c, so that
code specific to feature negotiation can remain in feat.c. This is possible
by calling a function already existing in options.c.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h |  2 ++
 net/dccp/feat.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 031ce350d3c..2e2a6f229ea 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -443,6 +443,8 @@ static inline int dccp_ack_pending(const struct sock *sk)
 
 extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
 extern int  dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
+extern int  dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
+				  struct sk_buff *skb);
 extern void dccp_feat_list_purge(struct list_head *fn_list);
 
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 44b10afd3fb..da686464462 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -300,6 +300,20 @@ cloning_failed:
 	return -ENOMEM;
 }
 
+/**
+ * dccp_feat_valid_nn_length  -  Enforce length constraints on NN options
+ * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
+ * incoming options are accepted as long as their values are valid.
+ */
+static u8 dccp_feat_valid_nn_length(u8 feat_num)
+{
+	if (feat_num == DCCPF_ACK_RATIO)	/* RFC 4340, 11.3 and 6.6.8 */
+		return 2;
+	if (feat_num == DCCPF_SEQUENCE_WINDOW)	/* RFC 4340, 7.5.2 and 6.5  */
+		return 6;
+	return 0;
+}
+
 static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
 {
 	switch (feat_num) {
@@ -341,6 +355,57 @@ static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
 	return 1;
 }
 
+/**
+ * dccp_feat_insert_opts  -  Generate FN options from current list state
+ * @skb: next sk_buff to be sent to the peer
+ * @dp: for client during handshake and general negotiation
+ * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
+ */
+int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
+			  struct sk_buff *skb)
+{
+	struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+	struct dccp_feat_entry *pos, *next;
+	u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
+	bool rpt;
+
+	/* put entries into @skb in the order they appear in the list */
+	list_for_each_entry_safe_reverse(pos, next, fn, node) {
+		opt  = dccp_feat_genopt(pos);
+		type = dccp_feat_type(pos->feat_num);
+		rpt  = false;
+
+		if (pos->empty_confirm) {
+			len = 0;
+			ptr = NULL;
+		} else {
+			if (type == FEAT_SP) {
+				len = pos->val.sp.len;
+				ptr = pos->val.sp.vec;
+				rpt = pos->needs_confirm;
+			} else if (type == FEAT_NN) {
+				len = dccp_feat_valid_nn_length(pos->feat_num);
+				ptr = nn_in_nbo;
+				dccp_encode_value_var(pos->val.nn, ptr, len);
+			} else {
+				DCCP_BUG("unknown feature %u", pos->feat_num);
+				return -1;
+			}
+		}
+
+		if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
+			return -1;
+		if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
+			return -1;
+		/*
+		 * Enter CHANGING after transmitting the Change option (6.6.2).
+		 */
+		if (pos->state == FEAT_INITIALISING)
+			pos->state = FEAT_CHANGING;
+	}
+	return 0;
+}
+
 /**
  * __feat_register_nn  -  Register new NN value on socket
  * @fn: feature-negotiation list to register with
-- 
cgit v1.2.3


From f8a644c07e6f38b2c3cbaf99990e867d670d207b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Integrate feature-negotiation insertion code

The patch implements insertion of feature negotiation at the server (listening
and request socket) and the client (connecting socket).

In dccp_insert_options(), several statements have been grouped together now
to achieve (I hope) better efficiency by reducing the number of tests each
packet has to go through:
 - Ack Vectors are sent if the packet is neither a Data or a Request packet;
 - a previous issue is corrected - feature negotiation options are allowed
   on DataAck packets (5.8).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/options.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index bfa1cb8f3ef..0e277114cb2 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -554,11 +554,25 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 	    dccp_insert_option_ndp(sk, skb))
 		return -1;
 
-	if (!dccp_packet_without_ack(skb)) {
-		if (dmsk->dccpms_send_ack_vector &&
-		    dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
-		    dccp_insert_option_ackvec(sk, skb))
+	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
+
+		/* Feature Negotiation */
+		if (dccp_feat_insert_opts(dp, NULL, skb))
 			return -1;
+
+		if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
+			/*
+			 * Obtain RTT sample from Request/Response exchange.
+			 * This is currently used in CCID 3 initialisation.
+			 */
+			if (dccp_insert_option_timestamp(sk, skb))
+				return -1;
+
+		} else if (dmsk->dccpms_send_ack_vector	&&
+			   dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
+			   dccp_insert_option_ackvec(sk, skb)) {
+				return -1;
+		}
 	}
 
 	if (dp->dccps_hc_rx_insert_options) {
@@ -567,14 +581,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 		dp->dccps_hc_rx_insert_options = 0;
 	}
 
-	/*
-	 * Obtain RTT sample from Request/Response exchange.
-	 * This is currently used in CCID 3 initialisation.
-	 */
-	if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST &&
-	    dccp_insert_option_timestamp(sk, skb))
-		return -1;
-
 	if (dp->dccps_timestamp_echo != 0 &&
 	    dccp_insert_option_timestamp_echo(dp, NULL, skb))
 		return -1;
@@ -587,6 +593,9 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
 {
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
+	if (dccp_feat_insert_opts(NULL, dreq, skb))
+		return -1;
+
 	if (dreq->dreq_timestamp_echo != 0 &&
 	    dccp_insert_option_timestamp_echo(NULL, dreq, skb))
 		return -1;
-- 
cgit v1.2.3


From c664d4f4e2963ee355b1b0e77461eb844d1b288d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Preference list reconciliation

This provides two functions to
 * reconcile preference lists (with appropriate return codes) and
 * reorder the preference list if successful reconciliation changed the
   preferred value.

The patch also removes the old code for processing SP/NN Change options, since
new code to process these is mostly there already; related references have been
commented out.

The code for processing Change options follows in the next patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 75 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index da686464462..d53077bd40b 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -718,6 +718,76 @@ static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
 	return 0;
 }
 
+/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
+static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
+{
+	u8 c, s;
+
+	for (s = 0; s < slen; s++)
+		for (c = 0; c < clen; c++)
+			if (servlist[s] == clilist[c])
+				return servlist[s];
+	return -1;
+}
+
+/**
+ * dccp_feat_prefer  -  Move preferred entry to the start of array
+ * Reorder the @array_len elements in @array so that @preferred_value comes
+ * first. Returns >0 to indicate that @preferred_value does occur in @array.
+ */
+static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
+{
+	u8 i, does_occur = 0;
+
+	if (array != NULL) {
+		for (i = 0; i < array_len; i++)
+			if (array[i] == preferred_value) {
+				array[i] = array[0];
+				does_occur++;
+			}
+		if (does_occur)
+			array[0] = preferred_value;
+	}
+	return does_occur;
+}
+
+/**
+ * dccp_feat_reconcile  -  Reconcile SP preference lists
+ *  @fval: SP list to reconcile into
+ *  @arr: received SP preference list
+ *  @len: length of @arr in bytes
+ *  @is_server: whether this side is the server (and @fv is the server's list)
+ *  @reorder: whether to reorder the list in @fv after reconciling with @arr
+ * When successful, > 0 is returned and the reconciled list is in @fval.
+ * A value of 0 means that negotiation failed (no shared entry).
+ */
+static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
+			       bool is_server, bool reorder)
+{
+	int rc;
+
+	if (!fv->sp.vec || !arr) {
+		DCCP_CRIT("NULL feature value or array");
+		return 0;
+	}
+
+	if (is_server)
+		rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
+	else
+		rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
+
+	if (!reorder)
+		return rc;
+	if (rc < 0)
+		return 0;
+
+	/*
+	 * Reorder list: used for activating features and in dccp_insert_fn_opt.
+	 */
+	return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
+}
+
+#ifdef __this_is_the_old_framework_and_will_be_removed_later_in_a_subsequent_patch
 static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
 			       u8 *rpref, u8 rlen)
 {
@@ -913,6 +983,7 @@ static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 
 	return 0;
 }
+#endif /* (later) */
 
 static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
 				    u8 type, u8 feature)
@@ -988,12 +1059,14 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 	switch (feature) {
 	/* deal with SP features */
 	case DCCPF_CCID:
-		rc = dccp_feat_sp(sk, type, feature, val, len);
+		/* XXX Obsoleted by next patch
+		rc = dccp_feat_sp(sk, type, feature, val, len); */
 		break;
 
 	/* deal with NN features */
 	case DCCPF_ACK_RATIO:
-		rc = dccp_feat_nn(sk, type, feature, val, len);
+		/* XXX Obsoleted by next patch
+		rc = dccp_feat_nn(sk, type, feature, val, len); */
 		break;
 
 	/* XXX implement other features */
-- 
cgit v1.2.3


From 5a146b97d5e93db2df075c0d820f492bb996d0e3 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Process incoming Change feature-negotiation options

This adds/replaces code for processing incoming ChangeL/R options.
The main difference is that:
 * mandatory FN options are now interpreted inside the function
  (there are too many individual cases to do this externally);
 * the function returns an appropriate Reset code or 0,
   which is then used to fill in the data for the Reset packet.

Old code, which is no longer used or referenced, has been removed.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/feat.c    | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/dccp/feat.h    |   4 +-
 net/dccp/options.c |  23 +++------
 3 files changed, 155 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index d53077bd40b..01b4da7007b 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -983,7 +983,6 @@ static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 
 	return 0;
 }
-#endif /* (later) */
 
 static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
 				    u8 type, u8 feature)
@@ -1094,6 +1093,7 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 }
 
 EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
+#endif	/* (later) */
 
 int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 			   u8 *val, u8 len)
@@ -1234,6 +1234,150 @@ out_clean:
 
 EXPORT_SYMBOL_GPL(dccp_feat_clone);
 
+/**
+ * dccp_feat_change_recv  -  Process incoming ChangeL/R options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether the Change was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is the server (1) or the client (0)
+ */
+static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+				u8 feat, u8 *val, u8 len, const bool server)
+{
+	u8 defval, type = dccp_feat_type(feat);
+	const bool local = (opt == DCCPO_CHANGE_R);
+	struct dccp_feat_entry *entry;
+	dccp_feat_val fval;
+
+	if (len == 0 || type == FEAT_UNKNOWN)		/* 6.1 and 6.6.8 */
+		goto unknown_feature_or_value;
+
+	/*
+	 *	Negotiation of NN features: Change R is invalid, so there is no
+	 *	simultaneous negotiation; hence we do not look up in the list.
+	 */
+	if (type == FEAT_NN) {
+		if (local || len > sizeof(fval.nn))
+			goto unknown_feature_or_value;
+
+		/* 6.3.2: "The feature remote MUST accept any valid value..." */
+		fval.nn = dccp_decode_value_var(val, len);
+		if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+			goto unknown_feature_or_value;
+
+		return dccp_feat_push_confirm(fn, feat, local, &fval);
+	}
+
+	/*
+	 *	Unidirectional/simultaneous negotiation of SP features (6.3.1)
+	 */
+	entry = dccp_feat_list_lookup(fn, feat, local);
+	if (entry == NULL) {
+		/*
+		 * No particular preferences have been registered. We deal with
+		 * this situation by assuming that all valid values are equally
+		 * acceptable, and apply the following checks:
+		 * - if the peer's list is a singleton, we accept a valid value;
+		 * - if we are the server, we first try to see if the peer (the
+		 *   client) advertises the default value. If yes, we use it,
+		 *   otherwise we accept the preferred value;
+		 * - else if we are the client, we use the first list element.
+		 */
+		if (dccp_feat_clone_sp_val(&fval, val, 1))
+			return DCCP_RESET_CODE_TOO_BUSY;
+
+		if (len > 1 && server) {
+			defval = dccp_feat_default_value(feat);
+			if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
+				fval.sp.vec[0] = defval;
+		} else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
+			kfree(fval.sp.vec);
+			goto unknown_feature_or_value;
+		}
+
+		/* Treat unsupported CCIDs like invalid values */
+		if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
+			kfree(fval.sp.vec);
+			goto not_valid_or_not_known;
+		}
+
+		return dccp_feat_push_confirm(fn, feat, local, &fval);
+
+	} else if (entry->state == FEAT_UNSTABLE) {	/* 6.6.2 */
+		return 0;
+	}
+
+	if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
+		entry->empty_confirm = 0;
+	} else if (is_mandatory) {
+		return DCCP_RESET_CODE_MANDATORY_ERROR;
+	} else if (entry->state == FEAT_INITIALISING) {
+		/*
+		 * Failed simultaneous negotiation (server only): try to `save'
+		 * the connection by checking whether entry contains the default
+		 * value for @feat. If yes, send an empty Confirm to signal that
+		 * the received Change was not understood - which implies using
+		 * the default value.
+		 * If this also fails, we use Reset as the last resort.
+		 */
+		WARN_ON(!server);
+		defval = dccp_feat_default_value(feat);
+		if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
+			return DCCP_RESET_CODE_OPTION_ERROR;
+		entry->empty_confirm = 1;
+	}
+	entry->needs_confirm   = 1;
+	entry->needs_mandatory = 0;
+	entry->state	       = FEAT_STABLE;
+	return 0;
+
+unknown_feature_or_value:
+	if (!is_mandatory)
+		return dccp_push_empty_confirm(fn, feat, local);
+
+not_valid_or_not_known:
+	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+			    : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_parse_options  -  Process Feature-Negotiation Options
+ * @sk: for general use and used by the client during connection setup
+ * @dreq: used by the server during connection setup
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: value contents of @opt
+ * @len: length of @val in bytes
+ * Returns 0 on success, a Reset code for ending the connection otherwise.
+ */
+int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+			    u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+	bool server = false;
+
+	switch (sk->sk_state) {
+	/*
+	 *	Negotiation during connection setup
+	 */
+	case DCCP_LISTEN:
+		server = true;			/* fall through */
+	case DCCP_REQUESTING:
+		switch (opt) {
+		case DCCPO_CHANGE_L:
+		case DCCPO_CHANGE_R:
+			return dccp_feat_change_recv(fn, mandatory, opt, feat,
+						     val, len, server);
+		}
+	}
+	return 0;	/* ignore FN options in all other states */
+}
+
 int dccp_feat_init(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 8e863839ed9..ce97f3fe768 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -115,8 +115,8 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
 extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 				  u8 const *list, u8 len);
 extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
-extern int  dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
-				  u8 *val, u8 len);
+extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
+				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
 extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 				   u8 *val, u8 len);
 extern void dccp_feat_clean(struct dccp_minisock *dmsk);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 0e277114cb2..fb8466ea467 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -135,22 +135,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 				      (unsigned long long)opt_recv->dccpor_ndp);
 			break;
 		case DCCPO_CHANGE_L:
-			/* fall through */
 		case DCCPO_CHANGE_R:
 			if (pkt_type == DCCP_PKT_DATA)
 				break;
-			if (len < 2)
-				goto out_invalid_option;
-			rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
-						   len - 1);
-			/*
-			 * When there is a change error, change_recv is
-			 * responsible for dealing with it.  i.e. reply with an
-			 * empty confirm.
-			 * If the change was mandatory, then we need to die.
-			 */
-			if (rc && mandatory)
-				goto out_invalid_option;
+			rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
+						    *value, value + 1, len - 1);
+			if (rc)
+				goto out_featneg_failed;
 			break;
 		case DCCPO_CONFIRM_L:
 			/* fall through */
@@ -292,8 +283,10 @@ out_nonsensical_length:
 
 out_invalid_option:
 	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
-	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
-	DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
+	rc = DCCP_RESET_CODE_OPTION_ERROR;
+out_featneg_failed:
+	DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
+	DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
 	DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
 	DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
 	DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
-- 
cgit v1.2.3


From d2150b7bff3d397692cf0dc890f198d23564de5f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Processing Confirm options

Analogous to the previous patch, this adds code to interpret incoming Confirm
feature-negotiation options. Both functions operate on the feature-negotiation
list of either the request_sock (server) or the dccp_sock (client).

Thanks to Wei Yongjun for pointing out that it is overly restrictive to check
the entire list of confirmed SP values.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c    | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/dccp/feat.h    |   2 --
 net/dccp/options.c |  16 ++-------
 3 files changed, 101 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 01b4da7007b..da3bbad9d50 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -99,6 +99,13 @@ static int dccp_feat_default_value(u8 feat_num)
 	return idx < 0 ? : dccp_feat_table[idx].default_value;
 }
 
+/* Test for "Req'd" feature (RFC 4340, 6.4) */
+static inline int dccp_feat_must_be_understood(u8 feat_num)
+{
+	return	feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
+		feat_num == DCCPF_SEQUENCE_WINDOW;
+}
+
 /* copy constructor, fval must not already contain allocated memory */
 static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
 {
@@ -1093,7 +1100,6 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 }
 
 EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
-#endif	/* (later) */
 
 int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 			   u8 *val, u8 len)
@@ -1148,6 +1154,7 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 }
 
 EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
+#endif	/* (later) */
 
 void dccp_feat_clean(struct dccp_minisock *dmsk)
 {
@@ -1343,6 +1350,93 @@ not_valid_or_not_known:
 			    : DCCP_RESET_CODE_OPTION_ERROR;
 }
 
+/**
+ * dccp_feat_confirm_recv  -  Process received Confirm options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is server (1) or client (0)
+ */
+static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+				 u8 feat, u8 *val, u8 len, const bool server)
+{
+	u8 *plist, plen, type = dccp_feat_type(feat);
+	const bool local = (opt == DCCPO_CONFIRM_R);
+	struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
+
+	if (entry == NULL) {	/* nothing queued: ignore or handle error */
+		if (is_mandatory && type == FEAT_UNKNOWN)
+			return DCCP_RESET_CODE_MANDATORY_ERROR;
+
+		if (!local && type == FEAT_NN)		/* 6.3.2 */
+			goto confirmation_failed;
+		return 0;
+	}
+
+	if (entry->state != FEAT_CHANGING)		/* 6.6.2 */
+		return 0;
+
+	if (len == 0) {
+		if (dccp_feat_must_be_understood(feat))	/* 6.6.7 */
+			goto confirmation_failed;
+		/*
+		 * Empty Confirm during connection setup: this means reverting
+		 * to the `old' value, which in this case is the default. Since
+		 * we handle default values automatically when no other values
+		 * have been set, we revert to the old value by removing this
+		 * entry from the list.
+		 */
+		dccp_feat_list_pop(entry);
+		return 0;
+	}
+
+	if (type == FEAT_NN) {
+		if (len > sizeof(entry->val.nn))
+			goto confirmation_failed;
+
+		if (entry->val.nn == dccp_decode_value_var(val, len))
+			goto confirmation_succeeded;
+
+		DCCP_WARN("Bogus Confirm for non-existing value\n");
+		goto confirmation_failed;
+	}
+
+	/*
+	 * Parsing SP Confirms: the first element of @val is the preferred
+	 * SP value which the peer confirms, the remainder depends on @len.
+	 * Note that only the confirmed value need to be a valid SP value.
+	 */
+	if (!dccp_feat_is_valid_sp_val(feat, *val))
+		goto confirmation_failed;
+
+	if (len == 1) {		/* peer didn't supply a preference list */
+		plist = val;
+		plen  = len;
+	} else {		/* preferred value + preference list */
+		plist = val + 1;
+		plen  = len - 1;
+	}
+
+	/* Check whether the peer got the reconciliation right (6.6.8) */
+	if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
+		DCCP_WARN("Confirm selected the wrong value %u\n", *val);
+		return DCCP_RESET_CODE_OPTION_ERROR;
+	}
+	entry->val.sp.vec[0] = *val;
+
+confirmation_succeeded:
+	entry->state = FEAT_STABLE;
+	return 0;
+
+confirmation_failed:
+	DCCP_WARN("Confirmation failed\n");
+	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+			    : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
 /**
  * dccp_feat_parse_options  -  Process Feature-Negotiation Options
  * @sk: for general use and used by the client during connection setup
@@ -1373,6 +1467,10 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		case DCCPO_CHANGE_R:
 			return dccp_feat_change_recv(fn, mandatory, opt, feat,
 						     val, len, server);
+		case DCCPO_CONFIRM_R:
+		case DCCPO_CONFIRM_L:
+			return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
+						      val, len, server);
 		}
 	}
 	return 0;	/* ignore FN options in all other states */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index ce97f3fe768..618bed935b3 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -117,8 +117,6 @@ extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
 				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
-extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
-				   u8 *val, u8 len);
 extern void dccp_feat_clean(struct dccp_minisock *dmsk);
 extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index fb8466ea467..3a9a22f0ac1 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -134,26 +134,14 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
 				      (unsigned long long)opt_recv->dccpor_ndp);
 			break;
-		case DCCPO_CHANGE_L:
-		case DCCPO_CHANGE_R:
-			if (pkt_type == DCCP_PKT_DATA)
+		case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
+			if (pkt_type == DCCP_PKT_DATA)      /* RFC 4340, 6 */
 				break;
 			rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
 						    *value, value + 1, len - 1);
 			if (rc)
 				goto out_featneg_failed;
 			break;
-		case DCCPO_CONFIRM_L:
-			/* fall through */
-		case DCCPO_CONFIRM_R:
-			if (pkt_type == DCCP_PKT_DATA)
-				break;
-			if (len < 2)	/* FIXME this disallows empty confirm */
-				goto out_invalid_option;
-			if (dccp_feat_confirm_recv(sk, opt, *value,
-						   value + 1, len - 1))
-				goto out_invalid_option;
-			break;
 		case DCCPO_ACK_VECTOR_0:
 		case DCCPO_ACK_VECTOR_1:
 			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
-- 
cgit v1.2.3


From c926c6aed3e444e8c88a768f063b2de8fd6ae760 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Feature activation handlers

This patch provides the post-processing of feature negotiation state, after
the negotiation has completed.

To this purpose, handlers are used and added to the dccp_feat_table. Each
handler is passed a boolean flag whether the RX or TX side of the feature
is meant.

Several handlers are provided already, new handlers can easily be added.

The initialisation is now fully dynamic, i.e. CCIDs are activated only
after the feature negotiation. The integration of this dynamic activation
is done in the subsequent patches.

Thanks to Wei Yongjun for pointing out the necessity of skipping over empty
Confirm options while copying the negotiated feature values.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h |   1 +
 net/dccp/feat.c | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 204 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 2e2a6f229ea..1baed789bcc 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -445,6 +445,7 @@ extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
 extern int  dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
 extern int  dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
 				  struct sk_buff *skb);
+extern int  dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
 extern void dccp_feat_list_purge(struct list_head *fn_list);
 
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index da3bbad9d50..f78bd357b5b 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -25,11 +25,101 @@
 
 #define DCCP_FEAT_SP_NOAGREE (-123)
 
+/*
+ * Feature activation handlers.
+ *
+ * These all use an u64 argument, to provide enough room for NN/SP features. At
+ * this stage the negotiated values have been checked to be within their range.
+ */
+static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any());
+
+	if (new_ccid == NULL)
+		return -ENOMEM;
+
+	if (rx) {
+		ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+		dp->dccps_hc_rx_ccid = new_ccid;
+	} else {
+		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+		dp->dccps_hc_tx_ccid = new_ccid;
+	}
+	return 0;
+}
+
+static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
+{
+	if (!rx)
+		dccp_msk(sk)->dccpms_sequence_window = seq_win;
+	return 0;
+}
+
+static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
+{
+	if (rx)
+		dccp_sk(sk)->dccps_r_ack_ratio = ratio;
+	else
+		dccp_sk(sk)->dccps_l_ack_ratio = ratio;
+	return 0;
+}
+
+static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (rx) {
+		if (enable && dp->dccps_hc_rx_ackvec == NULL) {
+			dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
+			if (dp->dccps_hc_rx_ackvec == NULL)
+				return -ENOMEM;
+		} else if (!enable) {
+			dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+			dp->dccps_hc_rx_ackvec = NULL;
+		}
+	}
+	return 0;
+}
+
+static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
+{
+	if (!rx)
+		dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0);
+	return 0;
+}
+
+/*
+ * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
+ * `rx' holds when the sending peer informs about his partial coverage via a
+ * ChangeR() option. In the other case, we are the sender and the receiver
+ * announces its coverage via ChangeL() options. The policy here is to honour
+ * such communication by enabling the corresponding partial coverage - but only
+ * if it has not been set manually before; the warning here means that all
+ * packets will be dropped.
+ */
+static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (rx)
+		dp->dccps_pcrlen = cscov;
+	else {
+		if (dp->dccps_pcslen == 0)
+			dp->dccps_pcslen = cscov;
+		else if (cscov > dp->dccps_pcslen)
+			DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
+				  dp->dccps_pcslen, (u8)cscov);
+	}
+	return 0;
+}
+
 static const struct {
 	u8			feat_num;		/* DCCPF_xxx */
 	enum dccp_feat_type	rxtx;			/* RX or TX  */
 	enum dccp_feat_type	reconciliation;		/* SP or NN  */
 	u8			default_value;		/* as in 6.4 */
+	int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
 /*
  *    Lookup table for location and type of features (from RFC 4340/4342)
  *  +--------------------------+----+-----+----+----+---------+-----------+
@@ -49,16 +139,16 @@ static const struct {
  *  +--------------------------+----+-----+----+----+---------+-----------+
  */
 } dccp_feat_table[] = {
-	{ DCCPF_CCID,		 FEAT_AT_TX, FEAT_SP, 2 },
-	{ DCCPF_SHORT_SEQNOS,	 FEAT_AT_TX, FEAT_SP, 0 },
-	{ DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100 },
-	{ DCCPF_ECN_INCAPABLE,	 FEAT_AT_RX, FEAT_SP, 0 },
-	{ DCCPF_ACK_RATIO,	 FEAT_AT_TX, FEAT_NN, 2 },
-	{ DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0 },
-	{ DCCPF_SEND_NDP_COUNT,  FEAT_AT_TX, FEAT_SP, 0 },
-	{ DCCPF_MIN_CSUM_COVER,  FEAT_AT_RX, FEAT_SP, 0 },
-	{ DCCPF_DATA_CHECKSUM,	 FEAT_AT_RX, FEAT_SP, 0 },
-	{ DCCPF_SEND_LEV_RATE,	 FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_CCID,		 FEAT_AT_TX, FEAT_SP, 2,   dccp_hdlr_ccid     },
+	{ DCCPF_SHORT_SEQNOS,	 FEAT_AT_TX, FEAT_SP, 0,   NULL },
+	{ DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win  },
+	{ DCCPF_ECN_INCAPABLE,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
+	{ DCCPF_ACK_RATIO,	 FEAT_AT_TX, FEAT_NN, 2,   dccp_hdlr_ack_ratio},
+	{ DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0,   dccp_hdlr_ackvec   },
+	{ DCCPF_SEND_NDP_COUNT,  FEAT_AT_TX, FEAT_SP, 0,   dccp_hdlr_ndp      },
+	{ DCCPF_MIN_CSUM_COVER,  FEAT_AT_RX, FEAT_SP, 0,   dccp_hdlr_min_cscov},
+	{ DCCPF_DATA_CHECKSUM,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
+	{ DCCPF_SEND_LEV_RATE,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
 };
 #define DCCP_FEAT_SUPPORTED_MAX		ARRAY_SIZE(dccp_feat_table)
 
@@ -99,6 +189,41 @@ static int dccp_feat_default_value(u8 feat_num)
 	return idx < 0 ? : dccp_feat_table[idx].default_value;
 }
 
+static int __dccp_feat_activate(struct sock *sk, const int idx,
+				const bool is_local, dccp_feat_val const *fval)
+{
+	bool rx;
+	u64 val;
+
+	if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
+		return -1;
+	if (dccp_feat_table[idx].activation_hdlr == NULL)
+		return 0;
+
+	if (fval == NULL) {
+		val = dccp_feat_table[idx].default_value;
+	} else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
+		if (fval->sp.vec == NULL) {
+			/*
+			 * This can happen when an empty Confirm is sent
+			 * for an SP (i.e. known) feature. In this case
+			 * we would be using the default anyway.
+			 */
+			DCCP_CRIT("Feature #%d undefined: using default", idx);
+			val = dccp_feat_table[idx].default_value;
+		} else {
+			val = fval->sp.vec[0];
+		}
+	} else {
+		val = fval->nn;
+	}
+
+	/* Location is RX if this is a local-RX or remote-TX feature */
+	rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
+
+	return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
+}
+
 /* Test for "Req'd" feature (RFC 4340, 6.4) */
 static inline int dccp_feat_must_be_understood(u8 feat_num)
 {
@@ -1506,6 +1631,74 @@ out:
 
 EXPORT_SYMBOL_GPL(dccp_feat_init);
 
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_feat_entry *cur, *next;
+	int idx;
+	dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
+		 [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
+	};
+
+	list_for_each_entry(cur, fn_list, node) {
+		/*
+		 * An empty Confirm means that either an unknown feature type
+		 * or an invalid value was present. In the first case there is
+		 * nothing to activate, in the other the default value is used.
+		 */
+		if (cur->empty_confirm)
+			continue;
+
+		idx = dccp_feat_index(cur->feat_num);
+		if (idx < 0) {
+			DCCP_BUG("Unknown feature %u", cur->feat_num);
+			goto activation_failed;
+		}
+		if (cur->state != FEAT_STABLE) {
+			DCCP_CRIT("Negotiation of %s %u failed in state %u",
+				  cur->is_local ? "local" : "remote",
+				  cur->feat_num, cur->state);
+			goto activation_failed;
+		}
+		fvals[idx][cur->is_local] = &cur->val;
+	}
+
+	/*
+	 * Activate in decreasing order of index, so that the CCIDs are always
+	 * activated as the last feature. This avoids the case where a CCID
+	 * relies on the initialisation of one or more features that it depends
+	 * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
+	 */
+	for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
+		if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
+		    __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
+			DCCP_CRIT("Could not activate %d", idx);
+			goto activation_failed;
+		}
+
+	/* Clean up Change options which have been confirmed already */
+	list_for_each_entry_safe(cur, next, fn_list, node)
+		if (!cur->needs_confirm)
+			dccp_feat_list_pop(cur);
+
+	dccp_pr_debug("Activation OK\n");
+	return 0;
+
+activation_failed:
+	/*
+	 * We clean up everything that may have been allocated, since
+	 * it is difficult to track at which stage negotiation failed.
+	 * This is ok, since all allocation functions below are robust
+	 * against NULL arguments.
+	 */
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+	dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+	dp->dccps_hc_rx_ackvec = NULL;
+	return -1;
+}
+
 #ifdef CONFIG_IP_DCCP_DEBUG
 const char *dccp_feat_typename(const u8 type)
 {
-- 
cgit v1.2.3


From 3a53a9adfa269da7fa40fc476f09e46155c0143d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Integration of dynamic feature activation - part 1 (socket
 setup)

This first patch out of three replaces the hardcoded default settings with
initialisation code for the dynamic feature negotiation.

Note on retransmitting Confirm options:
---------------------------------------
This patch also defers flushing the client feature-negotiation queue,
due to the following considerations.

As long as the client is in PARTOPEN, it needs to retransmit the Confirm
options for the Change options received on the DCCP-Response from the server.

Otherwise, if the packet containing the Confirm options gets dropped in the
network, the connection aborts due to undefined feature negotiation state.

Thanks to Leandro Melo de Sales who reported a bug in an earlier revision
of the patch set, resulting from not retransmitting the Confirm options.

The patch now ensures that the client feature-negotiation queue is flushed only
when entering the OPEN state. Since confirmed Change options are removed as
soon as they are confirmed (in the DCCP-Response), this ensures that Confirm
options are retransmitted.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/proto.c | 46 ++++++----------------------------------------
 1 file changed, 6 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6550452d59e..0d420790b79 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -67,6 +67,9 @@ void dccp_set_state(struct sock *sk, const int state)
 	case DCCP_OPEN:
 		if (oldstate != DCCP_OPEN)
 			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+		/* Client retransmits all Confirm options until entering OPEN */
+		if (oldstate == DCCP_PARTOPEN)
+			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
 		break;
 
 	case DCCP_CLOSED:
@@ -175,7 +178,6 @@ EXPORT_SYMBOL_GPL(dccp_state_name);
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	dccp_minisock_init(&dp->dccps_minisock);
@@ -194,45 +196,9 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	dccp_init_xmit_timers(sk);
 
 	INIT_LIST_HEAD(&dp->dccps_featneg);
-	/*
-	 * FIXME: We're hardcoding the CCID, and doing this at this point makes
-	 * the listening (master) sock get CCID control blocks, which is not
-	 * necessary, but for now, to not mess with the test userspace apps,
-	 * lets leave it here, later the real solution is to do this in a
-	 * setsockopt(CCIDs-I-want/accept). -acme
-	 */
-	if (likely(ctl_sock_initialized)) {
-		int rc = dccp_feat_init(sk);
-
-		if (rc)
-			return rc;
-
-		if (dmsk->dccpms_send_ack_vector) {
-			dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
-			if (dp->dccps_hc_rx_ackvec == NULL)
-				return -ENOMEM;
-		}
-		dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
-						      sk, GFP_KERNEL);
-		dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
-						      sk, GFP_KERNEL);
-		if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
-			     dp->dccps_hc_tx_ccid == NULL)) {
-			ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-			ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-			if (dmsk->dccpms_send_ack_vector) {
-				dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
-				dp->dccps_hc_rx_ackvec = NULL;
-			}
-			dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
-			return -ENOMEM;
-		}
-	} else {
-		/* control socket doesn't need feat nego */
-		INIT_LIST_HEAD(&dmsk->dccpms_pending);
-		INIT_LIST_HEAD(&dmsk->dccpms_conf);
-	}
-
+	/* control socket doesn't need feat nego */
+	if (likely(ctl_sock_initialized))
+		return dccp_feat_init(sk);
 	return 0;
 }
 
-- 
cgit v1.2.3


From e70cacb90d76f0632f7bba69c87a62e709e84619 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Integration of dynamic feature activation - part 2 (server
 side)

This patch integrates the activation of features at the end of negotiation
into the server-side code.

Note:
  In dccp_create_openreq_child the request_sock argument is no longer constant,
  since dccp_activate_values() uses the feature-negotiation list on dreq to sort
  out the initialisation values for the different features of the child socket;
  and purges this queue after use (but the `req' argument to openreq_child
  can and does still remain constant).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/minisocks.c | 42 ++++++++++++------------------------------
 1 file changed, 12 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index ee7f40f8ddf..258195972bc 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -111,7 +111,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
-		const struct dccp_request_sock *dreq = dccp_rsk(req);
+		struct dccp_request_sock *dreq = dccp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct dccp_sock *newdp = dccp_sk(newsk);
 		struct dccp_minisock *newdmsk = dccp_msk(newsk);
@@ -125,35 +125,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		newicsk->icsk_rto	    = DCCP_TIMEOUT_INIT;
 
 		INIT_LIST_HEAD(&newdp->dccps_featneg);
-		if (dccp_feat_clone(sk, newsk))
-			goto out_free;
-
-		if (newdmsk->dccpms_send_ack_vector) {
-			newdp->dccps_hc_rx_ackvec =
-						dccp_ackvec_alloc(GFP_ATOMIC);
-			if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
-				goto out_free;
-		}
-
-		newdp->dccps_hc_rx_ccid =
-			    ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
-					   newsk, GFP_ATOMIC);
-		newdp->dccps_hc_tx_ccid =
-			    ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
-					   newsk, GFP_ATOMIC);
-		if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
-			     newdp->dccps_hc_tx_ccid == NULL)) {
-			dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
-			ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
-			ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
-out_free:
-			/* It is still raw copy of parent, so invalidate
-			 * destructor and make plain sk_free() */
-			newsk->sk_destruct = NULL;
-			sk_free(newsk);
-			return NULL;
-		}
-
 		/*
 		 * Step 3: Process LISTEN state
 		 *
@@ -184,6 +155,17 @@ out_free:
 		dccp_set_seqno(&newdp->dccps_awl,
 			       max48(newdp->dccps_awl, newdp->dccps_iss));
 
+		/*
+		 * Activate features after initialising the sequence numbers,
+		 * since CCID initialisation may depend on GSS, ISR, ISS etc.
+		 */
+		if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			return NULL;
+		}
 		dccp_init_xmit_timers(newsk);
 
 		DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
-- 
cgit v1.2.3


From c49b22729f3da7479c4e6c572d53fdd40201d0bd Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Integration of dynamic feature activation - part 3 (client
 side)

This integrates feature-activation in the client, with these details:

 1. When dccp_parse_options() fails, the reset code is already set, request_sent
    _state_process() currently overrides this with `Packet Error', which is not
    intended - so changed to use the reset code set in dccp_parse_options();

 2. There was a FIXME to change the error code when dccp_ackvec_add() fails.
    I have looked this up and found that:
    * the check whether ackno < ISN is already made earlier,
    * this Response is likely the 1st packet with an Ackno that the client gets,
    * so when dccp_ackvec_add() fails, the reason is likely not a packet error.

 3. When feature negotiation fails, the socket should be marked as not usable,
    so that the application is notified that an error occurs. This is achieved
    by a new label, which uses an error code of `Aborted' and which sets the
    socket state to CLOSED, as well as sk_err.

 4. Avoids parsing the Ack twice in Respond state by not doing option processing
    again in dccp_rcv_respond_partopen_state_process (as option processing has
    already been done on the request_sock in dccp_check_req).

Since this addresses congestion-control initialisation, a corresponding
FIXME has been removed.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/input.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3070015edc7..0672b7e1763 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -421,8 +421,13 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			goto out_invalid_packet;
 		}
 
+		/*
+		 * If option processing (Step 8) failed, return 1 here so that
+		 * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
+		 * the option type and is set in dccp_parse_options().
+		 */
 		if (dccp_parse_options(sk, NULL, skb))
-			goto out_invalid_packet;
+			return 1;
 
 		/* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
 		if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
@@ -475,6 +480,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		 */
 		dccp_set_state(sk, DCCP_PARTOPEN);
 
+		/*
+		 * If feature negotiation was successful, activate features now;
+		 * an activation failure means that this host could not activate
+		 * one ore more features (e.g. insufficient memory), which would
+		 * leave at least one feature in an undefined state.
+		 */
+		if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
+			goto unable_to_proceed;
+
 		/* Make sure socket is routed, for correct metrics. */
 		icsk->icsk_af_ops->rebuild_header(sk);
 
@@ -509,6 +523,16 @@ out_invalid_packet:
 	/* dccp_v4_do_rcv will send a reset */
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
 	return 1;
+
+unable_to_proceed:
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
+	/*
+	 * We mark this socket as no longer usable, so that the loop in
+	 * dccp_sendmsg() terminates and the application gets notified.
+	 */
+	dccp_set_state(sk, DCCP_CLOSED);
+	sk->sk_err = ECOMM;
+	return 1;
 }
 
 static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
@@ -600,7 +624,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 1;
 	}
 
-	if (sk->sk_state != DCCP_REQUESTING) {
+	if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
 		if (dccp_check_seqno(sk, skb))
 			goto discard;
 
@@ -665,8 +689,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 1;
 
 	case DCCP_REQUESTING:
-		/* FIXME: do congestion control initialization */
-
 		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
 		if (queued >= 0)
 			return queued;
-- 
cgit v1.2.3


From 23479cbfd30402c7d9fa413cc467983061073557 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Clean up old feature-negotiation infrastructure

The code removed by this patch is no longer referenced or used, the added
lines update documentation and copyrights.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 505 +-------------------------------------------------------
 net/dccp/feat.h |  12 +-
 2 files changed, 12 insertions(+), 505 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index f78bd357b5b..6c82dea409d 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1,8 +1,13 @@
 /*
  *  net/dccp/feat.c
  *
- *  An implementation of the DCCP protocol
- *  Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *  Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *
+ *  Copyright (c) 2008 The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *  Rewrote from scratch, some bits from earlier code by
+ *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
  *
  *  ASSUMPTIONS
  *  -----------
@@ -17,14 +22,10 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  */
-
 #include <linux/module.h>
-
 #include "ccid.h"
 #include "feat.h"
 
-#define DCCP_FEAT_SP_NOAGREE (-123)
-
 /*
  * Feature activation handlers.
  *
@@ -805,51 +806,6 @@ int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
 	return 0;
 }
 
-static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
-	/* figure out if we are changing our CCID or the peer's */
-	const int rx = type == DCCPO_CHANGE_R;
-	const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
-	struct ccid *new_ccid;
-
-	/* Check if nothing is being changed. */
-	if (ccid_nr == new_ccid_nr)
-		return 0;
-
-	new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC);
-	if (new_ccid == NULL)
-		return -ENOMEM;
-
-	if (rx) {
-		ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-		dp->dccps_hc_rx_ccid = new_ccid;
-		dmsk->dccpms_rx_ccid = new_ccid_nr;
-	} else {
-		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-		dp->dccps_hc_tx_ccid = new_ccid;
-		dmsk->dccpms_tx_ccid = new_ccid_nr;
-	}
-
-	return 0;
-}
-
-static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
-{
-	dccp_feat_debug(type, feat, val);
-
-	switch (feat) {
-	case DCCPF_CCID:
-		return dccp_feat_update_ccid(sk, type, val);
-	default:
-		dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n",
-			      dccp_feat_typename(type), feat);
-		break;
-	}
-	return 0;
-}
-
 /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
 static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
 {
@@ -919,453 +875,6 @@ static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
 	return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
 }
 
-#ifdef __this_is_the_old_framework_and_will_be_removed_later_in_a_subsequent_patch
-static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
-			       u8 *rpref, u8 rlen)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-	u8 *spref, slen, *res = NULL;
-	int i, j, rc, agree = 1;
-
-	BUG_ON(rpref == NULL);
-
-	/* check if we are the black sheep */
-	if (dp->dccps_role == DCCP_ROLE_CLIENT) {
-		spref = rpref;
-		slen  = rlen;
-		rpref = opt->dccpop_val;
-		rlen  = opt->dccpop_len;
-	} else {
-		spref = opt->dccpop_val;
-		slen  = opt->dccpop_len;
-	}
-	/*
-	 * Now we have server preference list in spref and client preference in
-	 * rpref
-	 */
-	BUG_ON(spref == NULL);
-	BUG_ON(rpref == NULL);
-
-	/* FIXME sanity check vals */
-
-	/* Are values in any order?  XXX Lame "algorithm" here */
-	for (i = 0; i < slen; i++) {
-		for (j = 0; j < rlen; j++) {
-			if (spref[i] == rpref[j]) {
-				res = &spref[i];
-				break;
-			}
-		}
-		if (res)
-			break;
-	}
-
-	/* we didn't agree on anything */
-	if (res == NULL) {
-		/* confirm previous value */
-		switch (opt->dccpop_feat) {
-		case DCCPF_CCID:
-			/* XXX did i get this right? =P */
-			if (opt->dccpop_type == DCCPO_CHANGE_L)
-				res = &dccp_msk(sk)->dccpms_tx_ccid;
-			else
-				res = &dccp_msk(sk)->dccpms_rx_ccid;
-			break;
-
-		default:
-			DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat);
-			/* XXX implement res */
-			return -EFAULT;
-		}
-
-		dccp_pr_debug("Don't agree... reconfirming %d\n", *res);
-		agree = 0; /* this is used for mandatory options... */
-	}
-
-	/* need to put result and our preference list */
-	rlen = 1 + opt->dccpop_len;
-	rpref = kmalloc(rlen, GFP_ATOMIC);
-	if (rpref == NULL)
-		return -ENOMEM;
-
-	*rpref = *res;
-	memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len);
-
-	/* put it in the "confirm queue" */
-	if (opt->dccpop_sc == NULL) {
-		opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC);
-		if (opt->dccpop_sc == NULL) {
-			kfree(rpref);
-			return -ENOMEM;
-		}
-	} else {
-		/* recycle the confirm slot */
-		BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
-		kfree(opt->dccpop_sc->dccpoc_val);
-		dccp_pr_debug("recycling confirm slot\n");
-	}
-	memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
-
-	opt->dccpop_sc->dccpoc_val = rpref;
-	opt->dccpop_sc->dccpoc_len = rlen;
-
-	/* update the option on our side [we are about to send the confirm] */
-	rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
-	if (rc) {
-		kfree(opt->dccpop_sc->dccpoc_val);
-		kfree(opt->dccpop_sc);
-		opt->dccpop_sc = NULL;
-		return rc;
-	}
-
-	dccp_pr_debug("Will confirm %d\n", *rpref);
-
-	/* say we want to change to X but we just got a confirm X, suppress our
-	 * change
-	 */
-	if (!opt->dccpop_conf) {
-		if (*opt->dccpop_val == *res)
-			opt->dccpop_conf = 1;
-		dccp_pr_debug("won't ask for change of same feature\n");
-	}
-
-	return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */
-}
-
-static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
-{
-	struct dccp_minisock *dmsk = dccp_msk(sk);
-	struct dccp_opt_pend *opt;
-	int rc = 1;
-	u8 t;
-
-	/*
-	 * We received a CHANGE.  We gotta match it against our own preference
-	 * list.  If we got a CHANGE_R it means it's a change for us, so we need
-	 * to compare our CHANGE_L list.
-	 */
-	if (type == DCCPO_CHANGE_L)
-		t = DCCPO_CHANGE_R;
-	else
-		t = DCCPO_CHANGE_L;
-
-	/* find our preference list for this feature */
-	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
-		if (opt->dccpop_type != t || opt->dccpop_feat != feature)
-			continue;
-
-		/* find the winner from the two preference lists */
-		rc = dccp_feat_reconcile(sk, opt, val, len);
-		break;
-	}
-
-	/* We didn't deal with the change.  This can happen if we have no
-	 * preference list for the feature.  In fact, it just shouldn't
-	 * happen---if we understand a feature, we should have a preference list
-	 * with at least the default value.
-	 */
-	BUG_ON(rc == 1);
-
-	return rc;
-}
-
-static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
-{
-	struct dccp_opt_pend *opt;
-	struct dccp_minisock *dmsk = dccp_msk(sk);
-	u8 *copy;
-	int rc;
-
-	/* NN features must be Change L (sec. 6.3.2) */
-	if (type != DCCPO_CHANGE_L) {
-		dccp_pr_debug("received %s for NN feature %d\n",
-				dccp_feat_typename(type), feature);
-		return -EFAULT;
-	}
-
-	/* XXX sanity check opt val */
-
-	/* copy option so we can confirm it */
-	opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
-	if (opt == NULL)
-		return -ENOMEM;
-
-	copy = kmemdup(val, len, GFP_ATOMIC);
-	if (copy == NULL) {
-		kfree(opt);
-		return -ENOMEM;
-	}
-
-	opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */
-	opt->dccpop_feat = feature;
-	opt->dccpop_val	 = copy;
-	opt->dccpop_len	 = len;
-
-	/* change feature */
-	rc = dccp_feat_update(sk, type, feature, *val);
-	if (rc) {
-		kfree(opt->dccpop_val);
-		kfree(opt);
-		return rc;
-	}
-
-	dccp_feat_debug(type, feature, *copy);
-
-	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
-
-	return 0;
-}
-
-static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
-				    u8 type, u8 feature)
-{
-	/* XXX check if other confirms for that are queued and recycle slot */
-	struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
-
-	if (opt == NULL) {
-		/* XXX what do we do?  Ignoring should be fine.  It's a change
-		 * after all =P
-		 */
-		return;
-	}
-
-	switch (type) {
-	case DCCPO_CHANGE_L:
-		opt->dccpop_type = DCCPO_CONFIRM_R;
-		break;
-	case DCCPO_CHANGE_R:
-		opt->dccpop_type = DCCPO_CONFIRM_L;
-		break;
-	default:
-		DCCP_WARN("invalid type %d\n", type);
-		kfree(opt);
-		return;
-	}
-	opt->dccpop_feat = feature;
-	opt->dccpop_val	 = NULL;
-	opt->dccpop_len	 = 0;
-
-	/* change feature */
-	dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature);
-
-	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
-}
-
-static void dccp_feat_flush_confirm(struct sock *sk)
-{
-	struct dccp_minisock *dmsk = dccp_msk(sk);
-	/* Check if there is anything to confirm in the first place */
-	int yes = !list_empty(&dmsk->dccpms_conf);
-
-	if (!yes) {
-		struct dccp_opt_pend *opt;
-
-		list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
-			if (opt->dccpop_conf) {
-				yes = 1;
-				break;
-			}
-		}
-	}
-
-	if (!yes)
-		return;
-
-	/* OK there is something to confirm... */
-	/* XXX check if packet is in flight?  Send delayed ack?? */
-	if (sk->sk_state == DCCP_OPEN)
-		dccp_send_ack(sk);
-}
-
-int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
-{
-	int rc;
-
-	/* Ignore Change requests other than during connection setup */
-	if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING)
-		return 0;
-	dccp_feat_debug(type, feature, *val);
-
-	/* figure out if it's SP or NN feature */
-	switch (feature) {
-	/* deal with SP features */
-	case DCCPF_CCID:
-		/* XXX Obsoleted by next patch
-		rc = dccp_feat_sp(sk, type, feature, val, len); */
-		break;
-
-	/* deal with NN features */
-	case DCCPF_ACK_RATIO:
-		/* XXX Obsoleted by next patch
-		rc = dccp_feat_nn(sk, type, feature, val, len); */
-		break;
-
-	/* XXX implement other features */
-	default:
-		dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n",
-			      dccp_feat_typename(type), feature);
-		rc = -EFAULT;
-		break;
-	}
-
-	/* check if there were problems changing features */
-	if (rc) {
-		/* If we don't agree on SP, we sent a confirm for old value.
-		 * However we propagate rc to caller in case option was
-		 * mandatory
-		 */
-		if (rc != DCCP_FEAT_SP_NOAGREE)
-			dccp_feat_empty_confirm(dccp_msk(sk), type, feature);
-	}
-
-	/* generate the confirm [if required] */
-	dccp_feat_flush_confirm(sk);
-
-	return rc;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
-
-int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
-			   u8 *val, u8 len)
-{
-	u8 t;
-	struct dccp_opt_pend *opt;
-	struct dccp_minisock *dmsk = dccp_msk(sk);
-	int found = 0;
-	int all_confirmed = 1;
-
-	/* Ignore Confirm options other than during connection setup */
-	if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING)
-		return 0;
-	dccp_feat_debug(type, feature, *val);
-
-	/* locate our change request */
-	switch (type) {
-	case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break;
-	case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break;
-	default:	      DCCP_WARN("invalid type %d\n", type);
-			      return 1;
-
-	}
-	/* XXX sanity check feature value */
-
-	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
-		if (!opt->dccpop_conf && opt->dccpop_type == t &&
-		    opt->dccpop_feat == feature) {
-			found = 1;
-			dccp_pr_debug("feature %d found\n", opt->dccpop_feat);
-
-			/* XXX do sanity check */
-
-			opt->dccpop_conf = 1;
-
-			/* We got a confirmation---change the option */
-			dccp_feat_update(sk, opt->dccpop_type,
-					 opt->dccpop_feat, *val);
-
-			/* XXX check the return value of dccp_feat_update */
-			break;
-		}
-
-		if (!opt->dccpop_conf)
-			all_confirmed = 0;
-	}
-
-	if (!found)
-		dccp_pr_debug("%s(%d, ...) never requested\n",
-			      dccp_feat_typename(type), feature);
-	return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
-#endif	/* (later) */
-
-void dccp_feat_clean(struct dccp_minisock *dmsk)
-{
-	struct dccp_opt_pend *opt, *next;
-
-	list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending,
-				 dccpop_node) {
-		BUG_ON(opt->dccpop_val == NULL);
-		kfree(opt->dccpop_val);
-
-		if (opt->dccpop_sc != NULL) {
-			BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
-			kfree(opt->dccpop_sc->dccpoc_val);
-			kfree(opt->dccpop_sc);
-		}
-
-		kfree(opt);
-	}
-	INIT_LIST_HEAD(&dmsk->dccpms_pending);
-
-	list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
-		BUG_ON(opt == NULL);
-		if (opt->dccpop_val != NULL)
-			kfree(opt->dccpop_val);
-		kfree(opt);
-	}
-	INIT_LIST_HEAD(&dmsk->dccpms_conf);
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_clean);
-
-/* this is to be called only when a listening sock creates its child.  It is
- * assumed by the function---the confirm is not duplicated, but rather it is
- * "passed on".
- */
-int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
-{
-	struct dccp_minisock *olddmsk = dccp_msk(oldsk);
-	struct dccp_minisock *newdmsk = dccp_msk(newsk);
-	struct dccp_opt_pend *opt;
-	int rc = 0;
-
-	INIT_LIST_HEAD(&newdmsk->dccpms_pending);
-	INIT_LIST_HEAD(&newdmsk->dccpms_conf);
-
-	list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) {
-		struct dccp_opt_pend *newopt;
-		/* copy the value of the option */
-		u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC);
-
-		if (val == NULL)
-			goto out_clean;
-
-		newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC);
-		if (newopt == NULL) {
-			kfree(val);
-			goto out_clean;
-		}
-
-		/* insert the option */
-		newopt->dccpop_val = val;
-		list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
-
-		/* XXX what happens with backlogs and multiple connections at
-		 * once...
-		 */
-		/* the master socket no longer needs to worry about confirms */
-		opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */
-
-		/* reset state for a new socket */
-		opt->dccpop_conf = 0;
-	}
-
-	/* XXX not doing anything about the conf queue */
-
-out:
-	return rc;
-
-out_clean:
-	dccp_feat_clean(newdmsk);
-	rc = -ENOMEM;
-	goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_clone);
-
 /**
  * dccp_feat_change_recv  -  Process incoming ChangeL/R options
  * @fn: feature-negotiation list to update
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 618bed935b3..177e9c39ea9 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -3,14 +3,14 @@
 /*
  *  net/dccp/feat.h
  *
- *  An implementation of the DCCP protocol
+ *  Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *  Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
  *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
  *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License version 2 as
- *	published by the Free Software Foundation.
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
  */
-
 #include <linux/types.h>
 #include "dccp.h"
 
@@ -117,8 +117,6 @@ extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
 				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
-extern void dccp_feat_clean(struct dccp_minisock *dmsk);
-extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
 extern int  dccp_feat_init(struct sock *sk);
 
-- 
cgit v1.2.3


From 78673e24df27c76ec75565f4024d45c2c74ef148 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Remove obsolete parts of the old CCID interface

The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector
case, their value equals initially that of the sysctl, but at the end of
feature negotiation may be something different.

The old interface removed by this patch thus has been replaced by the newer
interface to dynamically query the currently loaded CCIDs earlier in this
patch set.

Also removed the constructors for the TX CCID and the RX CCID, since the
switch rx/non-rx is done by the handler in minisocks.c (and the handler is
the only place in the code where CCIDs are loaded).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ccid.c      | 14 --------------
 net/dccp/ccid.h      |  5 -----
 net/dccp/feat.c      | 12 ------------
 net/dccp/minisocks.c |  2 --
 4 files changed, 33 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index f72ca83df55..330372a1a0b 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -253,20 +253,6 @@ out_module_put:
 
 EXPORT_SYMBOL_GPL(ccid_new);
 
-struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
-{
-	return ccid_new(id, sk, 1, gfp);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
-
-struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk,  gfp_t gfp)
-{
-	return ccid_new(id, sk, 0, gfp);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
-
 static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
 {
 	struct ccid_operations *ccid_ops;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 803343aed00..18f69423a70 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -111,11 +111,6 @@ extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
 			     gfp_t gfp);
 
-extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
-				   gfp_t gfp);
-extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
-				   gfp_t gfp);
-
 static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
 {
 	struct ccid *ccid = dp->dccps_hc_rx_ccid;
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 6c82dea409d..cb2ddd2b894 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1119,18 +1119,6 @@ int dccp_feat_init(struct sock *sk)
 	INIT_LIST_HEAD(&dmsk->dccpms_pending);	/* XXX no longer used */
 	INIT_LIST_HEAD(&dmsk->dccpms_conf);	/* XXX no longer used */
 
-	/* CCID L */
-	rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0,
-				&dmsk->dccpms_tx_ccid, 1);
-	if (rc)
-		goto out;
-
-	/* CCID R */
-	rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0,
-				&dmsk->dccpms_rx_ccid, 1);
-	if (rc)
-		goto out;
-
 	/* Ack ratio */
 	rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0,
 				dp->dccps_l_ack_ratio);
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 258195972bc..486d61df260 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -45,8 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row);
 void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
-	dmsk->dccpms_rx_ccid	     = sysctl_dccp_feat_rx_ccid;
-	dmsk->dccpms_tx_ccid	     = sysctl_dccp_feat_tx_ccid;
 	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
 	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
 }
-- 
cgit v1.2.3


From 68e074bfcef269bc61006c2740d7f89ccbbd93d7 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Remove manual influence on NDP Count feature

Updating the NDP count feature is handled automatically now:
 * for CCID-2 it is disabled, since the code does not use NDP counts;
 * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths.

Allowing the user to change NDP values leads to unpredictable and failing
behaviour, since it is then possible to disable NDP counts even when they
are needed (e.g. in CCID-3).

This means that only those user settings are sensible that agree with the
values for Send NDP Count implied by the choice of CCID. But those settings
are already activated by the feature negotiation (CCID dependency tracking),
hence this form of support is redundant.

At startup the initialisation of the NDP count feature is with the default
value of 0, which is done implicitly by the zeroing-out of the socket when
it is allocated. If the choice of CCID or feature negotiation enables NDP
count, this will then be updated via the NDP activation handler.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h      | 1 -
 net/dccp/feat.c      | 2 +-
 net/dccp/minisocks.c | 1 -
 net/dccp/options.c   | 4 +---
 net/dccp/sysctl.c    | 7 -------
 5 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 1baed789bcc..51436c82565 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -99,7 +99,6 @@ extern int  sysctl_dccp_feat_sequence_window;
 extern int  sysctl_dccp_feat_rx_ccid;
 extern int  sysctl_dccp_feat_tx_ccid;
 extern int  sysctl_dccp_feat_send_ack_vector;
-extern int  sysctl_dccp_feat_send_ndp_count;
 extern int  sysctl_dccp_tx_qlen;
 extern int  sysctl_dccp_sync_ratelimit;
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index cb2ddd2b894..35a57ab3bb1 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -86,7 +86,7 @@ static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
 static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
 {
 	if (!rx)
-		dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0);
+		dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
 	return 0;
 }
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 486d61df260..9e223257266 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -46,7 +46,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
 	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
-	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
 }
 
 void dccp_time_wait(struct sock *sk, int state, int timeo)
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 3a9a22f0ac1..6b0704497e8 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -27,7 +27,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
 int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
-int sysctl_dccp_feat_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
 u64 dccp_decode_value_var(const u8 *bf, const u8 len)
 {
@@ -531,8 +530,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
-	if (dmsk->dccpms_send_ndp_count &&
-	    dccp_insert_option_ndp(sk, skb))
+	if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
 		return -1;
 
 	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index f6e54f433e2..587c12f915c 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -47,13 +47,6 @@ static struct ctl_table dccp_default_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "send_ndp",
-		.data		= &sysctl_dccp_feat_send_ndp_count,
-		.maxlen		= sizeof(sysctl_dccp_feat_send_ndp_count),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "request_retries",
 		.data		= &sysctl_dccp_request_retries,
-- 
cgit v1.2.3


From b235dc4abbc1356284bd0dc730efa711f394e0e2 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl

This removes the use of the sysctl and the minisock variable for the Send Ack
Vector feature, which is now handled fully dynamically via feature negotiation;
i.e. when CCID2 is enabled, Ack Vectors are automatically enabled (as per
RFC 4341, 4.).

Using a sysctl in parallel to this implementation would open the door to
crashes, since much of the code relies on tests of the boolean minisock /
sysctl variable. Thus, this patch replaces all tests of type

	if (dccp_msk(sk)->dccpms_send_ack_vector)
		/* ... */
with
	if (dp->dccps_hc_rx_ackvec != NULL)
		/* ... */

The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature
negotiation concluded that Ack Vectors are to be used on the half-connection.
Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child),
so that the test is a valid one.

The activation handler for Ack Vectors is called as soon as the feature
negotiation has concluded at the
 * server when the Ack marking the transition RESPOND => OPEN arrives;
 * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN.

Adding the sequence number of the Response packet to the Ack Vector has been
removed, since
 (a) connection establishment implies that the Response has been received;
 (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e.
     this entry will always be ignored;
 (c) it can not be used for anything useful - to detect loss for instance, only
     packets received after the loss can serve as pseudo-dupacks.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h      |  3 +--
 net/dccp/diag.c      |  2 +-
 net/dccp/input.c     | 12 +++---------
 net/dccp/minisocks.c |  1 -
 net/dccp/options.c   |  7 ++-----
 net/dccp/proto.c     |  3 +--
 net/dccp/sysctl.c    |  7 -------
 7 files changed, 8 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 51436c82565..3fd16e82c00 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -98,7 +98,6 @@ extern int  sysctl_dccp_retries2;
 extern int  sysctl_dccp_feat_sequence_window;
 extern int  sysctl_dccp_feat_rx_ccid;
 extern int  sysctl_dccp_feat_tx_ccid;
-extern int  sysctl_dccp_feat_send_ack_vector;
 extern int  sysctl_dccp_tx_qlen;
 extern int  sysctl_dccp_sync_ratelimit;
 
@@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
 	const struct dccp_sock *dp = dccp_sk(sk);
 	return dp->dccps_timestamp_echo != 0 ||
 #ifdef CONFIG_IP_DCCP_ACKVEC
-	       (dccp_msk(sk)->dccpms_send_ack_vector &&
+	       (dp->dccps_hc_rx_ackvec != NULL &&
 		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
 #endif
 	       inet_csk_ack_scheduled(sk);
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index d8a3509b26f..93aae7c9555 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_backoff	= icsk->icsk_backoff;
 	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
 
-	if (dccp_msk(sk)->dccpms_send_ack_vector)
+	if (dp->dccps_hc_rx_ackvec != NULL)
 		info->tcpi_options |= TCPI_OPT_SACK;
 
 	ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 0672b7e1763..5eb443f656c 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	if (dccp_msk(sk)->dccpms_send_ack_vector)
+	if (dp->dccps_hc_rx_ackvec != NULL)
 		dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
 					    DCCP_SKB_CB(skb)->dccpd_ack_seq);
 }
@@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 		dccp_event_ack_recv(sk, skb);
 
-	if (dccp_msk(sk)->dccpms_send_ack_vector &&
+	if (dp->dccps_hc_rx_ackvec != NULL &&
 	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
 			    DCCP_SKB_CB(skb)->dccpd_seq,
 			    DCCP_ACKVEC_STATE_RECEIVED))
@@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
 			    dp->dccps_options_received.dccpor_timestamp_echo));
 
-		if (dccp_msk(sk)->dccpms_send_ack_vector &&
-		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-				    DCCP_SKB_CB(skb)->dccpd_seq,
-				    DCCP_ACKVEC_STATE_RECEIVED))
-			goto out_invalid_packet; /* FIXME: change error code */
-
 		/* Stop the REQUEST timer */
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 		WARN_ON(sk->sk_send_head == NULL);
@@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 			dccp_event_ack_recv(sk, skb);
 
-		if (dccp_msk(sk)->dccpms_send_ack_vector &&
+		if (dp->dccps_hc_rx_ackvec != NULL &&
 		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
 				    DCCP_SKB_CB(skb)->dccpd_seq,
 				    DCCP_ACKVEC_STATE_RECEIVED))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 9e223257266..0ebf8ebcf3d 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row);
 void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
-	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
 }
 
 void dccp_time_wait(struct sock *sk, int state, int timeo)
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 6b0704497e8..aca309e1663 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -26,7 +26,6 @@
 int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
 int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
 
 u64 dccp_decode_value_var(const u8 *bf, const u8 len)
 {
@@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		case DCCPO_ACK_VECTOR_1:
 			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
 				break;
-
-			if (dccp_msk(sk)->dccpms_send_ack_vector &&
+			if (dp->dccps_hc_rx_ackvec != NULL &&
 			    dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
 				goto out_invalid_option;
 			break;
@@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb)
 int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
@@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 			if (dccp_insert_option_timestamp(sk, skb))
 				return -1;
 
-		} else if (dmsk->dccpms_send_ack_vector	&&
+		} else if (dp->dccps_hc_rx_ackvec != NULL &&
 			   dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
 			   dccp_insert_option_ackvec(sk, skb)) {
 				return -1;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0d420790b79..775eaa3d0c4 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -207,7 +207,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock);
 void dccp_destroy_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	/*
 	 * DCCP doesn't use sk_write_queue, just sk_send_head
@@ -225,7 +224,7 @@ void dccp_destroy_sock(struct sock *sk)
 	kfree(dp->dccps_service_list);
 	dp->dccps_service_list = NULL;
 
-	if (dmsk->dccpms_send_ack_vector) {
+	if (dp->dccps_hc_rx_ackvec != NULL) {
 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 587c12f915c..018e210875e 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "send_ackvec",
-		.data		= &sysctl_dccp_feat_send_ack_vector,
-		.maxlen		= sizeof(sysctl_dccp_feat_send_ack_vector),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "request_retries",
 		.data		= &sysctl_dccp_request_retries,
-- 
cgit v1.2.3


From 5d3dac267a7fd0811ec777e76a81f97f5cdcb395 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Initialisation framework for feature negotiation

This initialises feature negotiation from two tables, which are initialised
from sysctls.

As a novel feature, specifics of the implementation (e.g. currently short
seqnos and ECN are not supported) are advertised for robustness.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 net/dccp/feat.h |  2 +-
 2 files changed, 57 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 35a57ab3bb1..a687740e442 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1110,24 +1110,70 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	return 0;	/* ignore FN options in all other states */
 }
 
+/**
+ * dccp_feat_init  -  Seed feature negotiation with host-specific defaults
+ * This initialises global defaults, depending on the value of the sysctls.
+ * These can later be overridden by registering changes via setsockopt calls.
+ * The last link in the chain is finalise_settings, to make sure that between
+ * here and the start of actual feature negotiation no inconsistencies enter.
+ *
+ * All features not appearing below use either defaults or are otherwise
+ * later adjusted through dccp_feat_finalise_settings().
+ */
 int dccp_feat_init(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
+	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+	u8 on = 1, off = 0;
 	int rc;
+	struct {
+		u8 *val;
+		u8 len;
+	} tx, rx;
+
+	/* Non-negotiable (NN) features */
+	rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
+				    sysctl_dccp_feat_sequence_window);
+	if (rc)
+		return rc;
+
+	/* Server-priority (SP) features */
+
+	/* Advertise that short seqnos are not supported (7.6.1) */
+	rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
+	if (rc)
+		return rc;
 
-	INIT_LIST_HEAD(&dmsk->dccpms_pending);	/* XXX no longer used */
-	INIT_LIST_HEAD(&dmsk->dccpms_conf);	/* XXX no longer used */
+	/* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
+	rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
+	if (rc)
+		return rc;
+
+	/*
+	 * We advertise the available list of CCIDs and reorder according to
+	 * preferences, to avoid failure resulting from negotiating different
+	 * singleton values (which always leads to failure).
+	 * These settings can still (later) be overridden via sockopts.
+	 */
+	if (ccid_get_builtin_ccids(&tx.val, &tx.len) ||
+	    ccid_get_builtin_ccids(&rx.val, &rx.len))
+		return -ENOBUFS;
 
-	/* Ack ratio */
-	rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0,
-				dp->dccps_l_ack_ratio);
-out:
+	if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) ||
+	    !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len))
+		goto free_ccid_lists;
+
+	rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
+	if (rc)
+		goto free_ccid_lists;
+
+	rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
+
+free_ccid_lists:
+	kfree(tx.val);
+	kfree(rx.val);
 	return rc;
 }
 
-EXPORT_SYMBOL_GPL(dccp_feat_init);
-
 int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 177e9c39ea9..f73b47abca9 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -112,13 +112,13 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
 #define dccp_feat_debug(type, feat, val)
 #endif /* CONFIG_IP_DCCP_DEBUG */
 
+extern int  dccp_feat_init(struct sock *sk);
 extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 				  u8 const *list, u8 len);
 extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
 				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
-extern int  dccp_feat_init(struct sock *sk);
 
 /*
  * Encoding variable-length options and their maximum length.
-- 
cgit v1.2.3


From 09856c108956c99088ead9267ccbd1dab77f7043 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Auto-load (when supported) CCID plugins for negotiation

This adds auto-loading of CCIDs (when module loading is enabled)
for the purpose of feature negotiation.

The problem with loading the CCIDs at the end of feature negotiation is
that this would happen in software interrupt context. Besides, if the host
advertises CCIDs during negotiation, it should have them ready to use, in
case an agreeing peer wants to use it for the connection.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ccid.c | 39 +++++++++++++++++++++++++++++----------
 net/dccp/ccid.h |  1 +
 net/dccp/feat.c |  5 +++++
 3 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 330372a1a0b..e3fb52b4f5c 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -196,22 +196,41 @@ int ccid_unregister(struct ccid_operations *ccid_ops)
 
 EXPORT_SYMBOL_GPL(ccid_unregister);
 
+/**
+ * ccid_request_module  -  Pre-load CCID module for later use
+ * This should be called only from process context (e.g. during connection
+ * setup) and is necessary for later calls to ccid_new (typically in software
+ * interrupt), so that it has the modules available when they are needed.
+ */
+static int ccid_request_module(u8 id)
+{
+	if (!in_atomic()) {
+		ccids_read_lock();
+		if (ccids[id] == NULL) {
+			ccids_read_unlock();
+			return request_module("net-dccp-ccid-%d", id);
+		}
+		ccids_read_unlock();
+	}
+	return 0;
+}
+
+int ccid_request_modules(u8 const *ccid_array, u8 array_len)
+{
+#ifdef CONFIG_KMOD
+	while (array_len--)
+		if (ccid_request_module(ccid_array[array_len]))
+			return -1;
+#endif
+	return 0;
+}
+
 struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
 {
 	struct ccid_operations *ccid_ops;
 	struct ccid *ccid = NULL;
 
 	ccids_read_lock();
-#ifdef CONFIG_KMOD
-	if (ccids[id] == NULL) {
-		/* We only try to load if in process context */
-		ccids_read_unlock();
-		if (gfp & GFP_ATOMIC)
-			goto out;
-		request_module("net-dccp-ccid-%d", id);
-		ccids_read_lock();
-	}
-#endif
 	ccid_ops = ccids[id];
 	if (ccid_ops == NULL)
 		goto out_unlock;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 18f69423a70..20ba066b277 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -108,6 +108,7 @@ extern int  ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
 extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
 					  char __user *, int __user *);
 
+extern int    ccid_request_modules(u8 const *ccid_array, u8 array_len);
 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
 			     gfp_t gfp);
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index a687740e442..9a493809278 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1158,6 +1158,11 @@ int dccp_feat_init(struct sock *sk)
 	    ccid_get_builtin_ccids(&rx.val, &rx.len))
 		return -ENOBUFS;
 
+	/* Pre-load all CCID modules that are going to be advertised */
+	rc = -EUNATCH;
+	if (ccid_request_modules(tx.val, tx.len))
+		goto free_ccid_lists;
+
 	if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) ||
 	    !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len))
 		goto free_ccid_lists;
-- 
cgit v1.2.3


From 51c7d4fa2675c106a980ddcdbe308b54b5151945 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Implement both feature-local and feature-remote Sequence Window
 feature

This adds full support for local/remote Sequence Window feature, from which the
  * sequence-number-validity (W) and
  * acknowledgment-number-validity (W') windows
derive as specified in RFC 4340, 7.5.3.

Specifically, the following changes are introduced:
  * integrated new socket fields into dccp_sk;
  * updated the update_gsr/gss routines with regard to these fields;
  * updated handler code: the Sequence Window feature is located at the TX side,
    so the local feature is meant if the handler-rx flag is false;
  * the initialisation of `rcv_wnd' in reqsk is removed, since
    - rcv_wnd is not used by the code anywhere;
    - sequence number checks are not done in the LISTEN state (cf. 7.5.3);
    - dccp_check_req checks the Ack number validity more rigorously;
  * the `struct dccp_minisock' became empty and is now removed.

Until the handshake completes with activating negotiated values, the local/remote
Sequence-Window values are undefined and thus can not reliably be estimated.
This issue is addressed in a separate patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h      | 16 +++++++---------
 net/dccp/feat.c      | 13 +++++++++++--
 net/dccp/minisocks.c | 11 -----------
 net/dccp/proto.c     |  2 --
 4 files changed, 18 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 3fd16e82c00..6101ecdb85b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -409,23 +409,21 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
 static inline void dccp_update_gsr(struct sock *sk, u64 seq)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	const struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	dp->dccps_gsr = seq;
-	dccp_set_seqno(&dp->dccps_swl,
-		       dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
-	dccp_set_seqno(&dp->dccps_swh,
-		       dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
+	/* Sequence validity window depends on remote Sequence Window (7.5.1) */
+	dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+	dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
 }
 
 static inline void dccp_update_gss(struct sock *sk, u64 seq)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	dp->dccps_awh = dp->dccps_gss = seq;
-	dccp_set_seqno(&dp->dccps_awl,
-		       (dp->dccps_gss -
-			dccp_msk(sk)->dccpms_sequence_window + 1));
+	dp->dccps_gss = seq;
+	/* Ack validity window depends on local Sequence Window value (7.5.1) */
+	dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+	dp->dccps_awh = dp->dccps_gss;
 }
 
 static inline int dccp_ack_pending(const struct sock *sk)
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 9a493809278..84346599985 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -52,8 +52,17 @@ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
 
 static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
 {
-	if (!rx)
-		dccp_msk(sk)->dccpms_sequence_window = seq_win;
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (rx) {
+		dp->dccps_r_seq_win = seq_win;
+		/* propagate changes to update SWL/SWH */
+		dccp_update_gsr(sk, dp->dccps_gsr);
+	} else {
+		dp->dccps_l_seq_win = seq_win;
+		/* propagate changes to update AWL */
+		dccp_update_gss(sk, dp->dccps_gss);
+	}
 	return 0;
 }
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 0ebf8ebcf3d..0ecb19c5e8c 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -42,11 +42,6 @@ struct inet_timewait_death_row dccp_death_row = {
 
 EXPORT_SYMBOL_GPL(dccp_death_row);
 
-void dccp_minisock_init(struct dccp_minisock *dmsk)
-{
-	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
-}
-
 void dccp_time_wait(struct sock *sk, int state, int timeo)
 {
 	struct inet_timewait_sock *tw = NULL;
@@ -110,7 +105,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		struct dccp_request_sock *dreq = dccp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct dccp_sock *newdp = dccp_sk(newsk);
-		struct dccp_minisock *newdmsk = dccp_msk(newsk);
 
 		newdp->dccps_role	    = DCCP_ROLE_SERVER;
 		newdp->dccps_hc_rx_ackvec   = NULL;
@@ -128,10 +122,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		 *    Initialize S.GAR := S.ISS
 		 *    Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
 		 */
-
-		/* See dccp_v4_conn_request */
-		newdmsk->dccpms_sequence_window = req->rcv_wnd;
-
 		newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
 		dccp_update_gss(newsk, dreq->dreq_iss);
 
@@ -289,7 +279,6 @@ int dccp_reqsk_init(struct request_sock *req,
 
 	inet_rsk(req)->rmt_port	  = dccp_hdr(skb)->dccph_sport;
 	inet_rsk(req)->acked	  = 0;
-	req->rcv_wnd		  = sysctl_dccp_feat_sequence_window;
 	dreq->dreq_timestamp_echo = 0;
 
 	/* inherit feature negotiation options from listening socket */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 775eaa3d0c4..392a5d822b3 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -180,8 +180,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	dccp_minisock_init(&dp->dccps_minisock);
-
 	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
 	sk->sk_state		= DCCP_CLOSED;
-- 
cgit v1.2.3


From 0a4822679d94e2b0117aeead06a19fad59533905 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Initialisation and type-checking of feature sysctls

This patch takes care of initialising and type-checking sysctls related to
feature negotiation. Type checking is important since some of the sysctls
now directly act on the feature-negotiation process.

The sysctls are initialised with the known default values for each feature.
For the type-checking the value constraints from RFC 4340 are used:

 * Sequence Window uses the specified Wmin=32, the maximum is ulong (4 bytes),
   tested and confirmed that it works up to 4294967295 - for Gbps speed;
 * Ack Ratio is between 0 .. 0xffff (2-byte unsigned integer);
 * CCIDs are between 0 .. 255;
 * request_retries, retries1, retries2 also between 0..255 for good measure;
 * tx_qlen is checked to be non-negative;
 * sync_ratelimit remains as before.

Further changes:
----------------
Performed s@sysctl_dccp_feat@sysctl_dccp@g since the sysctls are now in feat.c.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h    |  3 ---
 net/dccp/feat.c    | 11 ++++++++---
 net/dccp/feat.h    |  8 ++++++++
 net/dccp/options.c |  4 ----
 net/dccp/sysctl.c  | 43 ++++++++++++++++++++++++++++++-------------
 5 files changed, 46 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 6101ecdb85b..4f681f1a16c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -95,9 +95,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
 extern int  sysctl_dccp_request_retries;
 extern int  sysctl_dccp_retries1;
 extern int  sysctl_dccp_retries2;
-extern int  sysctl_dccp_feat_sequence_window;
-extern int  sysctl_dccp_feat_rx_ccid;
-extern int  sysctl_dccp_feat_tx_ccid;
 extern int  sysctl_dccp_tx_qlen;
 extern int  sysctl_dccp_sync_ratelimit;
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 84346599985..4c95cbdb0e0 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -26,6 +26,11 @@
 #include "ccid.h"
 #include "feat.h"
 
+/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
+unsigned long	sysctl_dccp_sequence_window __read_mostly = 100;
+int		sysctl_dccp_rx_ccid	    __read_mostly = 2,
+		sysctl_dccp_tx_ccid	    __read_mostly = 2;
+
 /*
  * Feature activation handlers.
  *
@@ -1141,7 +1146,7 @@ int dccp_feat_init(struct sock *sk)
 
 	/* Non-negotiable (NN) features */
 	rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
-				    sysctl_dccp_feat_sequence_window);
+				    sysctl_dccp_sequence_window);
 	if (rc)
 		return rc;
 
@@ -1172,8 +1177,8 @@ int dccp_feat_init(struct sock *sk)
 	if (ccid_request_modules(tx.val, tx.len))
 		goto free_ccid_lists;
 
-	if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) ||
-	    !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len))
+	if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
+	    !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
 		goto free_ccid_lists;
 
 	rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f73b47abca9..f8456bca4ee 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -99,6 +99,13 @@ struct ccid_dependency {
 	u8	val;
 };
 
+/*
+ * Sysctls to seed defaults for feature negotiation
+ */
+extern unsigned long sysctl_dccp_sequence_window;
+extern int	     sysctl_dccp_rx_ccid;
+extern int	     sysctl_dccp_tx_ccid;
+
 #ifdef CONFIG_IP_DCCP_DEBUG
 extern const char *dccp_feat_typename(const u8 type);
 extern const char *dccp_feat_name(const u8 feat);
@@ -113,6 +120,7 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
 #endif /* CONFIG_IP_DCCP_DEBUG */
 
 extern int  dccp_feat_init(struct sock *sk);
+extern void dccp_feat_initialise_sysctls(void);
 extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 				  u8 const *list, u8 len);
 extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index aca309e1663..5906e96eedd 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -23,10 +23,6 @@
 #include "dccp.h"
 #include "feat.h"
 
-int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
-int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
-
 u64 dccp_decode_value_var(const u8 *bf, const u8 len)
 {
 	u64 value = 0;
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 018e210875e..a5a1856234e 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -18,55 +18,72 @@
 #error This file should not be compiled without CONFIG_SYSCTL defined
 #endif
 
+/* Boundary values */
+static int		zero     = 0,
+			u8_max   = 0xFF;
+static unsigned long	seqw_min = 32;
+
 static struct ctl_table dccp_default_table[] = {
 	{
 		.procname	= "seq_window",
-		.data		= &sysctl_dccp_feat_sequence_window,
-		.maxlen		= sizeof(sysctl_dccp_feat_sequence_window),
+		.data		= &sysctl_dccp_sequence_window,
+		.maxlen		= sizeof(sysctl_dccp_sequence_window),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &seqw_min,		/* RFC 4340, 7.5.2 */
 	},
 	{
 		.procname	= "rx_ccid",
-		.data		= &sysctl_dccp_feat_rx_ccid,
-		.maxlen		= sizeof(sysctl_dccp_feat_rx_ccid),
+		.data		= &sysctl_dccp_rx_ccid,
+		.maxlen		= sizeof(sysctl_dccp_rx_ccid),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,		/* RFC 4340, 10. */
 	},
 	{
 		.procname	= "tx_ccid",
-		.data		= &sysctl_dccp_feat_tx_ccid,
-		.maxlen		= sizeof(sysctl_dccp_feat_tx_ccid),
+		.data		= &sysctl_dccp_tx_ccid,
+		.maxlen		= sizeof(sysctl_dccp_tx_ccid),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,		/* RFC 4340, 10. */
 	},
 	{
 		.procname	= "request_retries",
 		.data		= &sysctl_dccp_request_retries,
 		.maxlen		= sizeof(sysctl_dccp_request_retries),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,
 	},
 	{
 		.procname	= "retries1",
 		.data		= &sysctl_dccp_retries1,
 		.maxlen		= sizeof(sysctl_dccp_retries1),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,
 	},
 	{
 		.procname	= "retries2",
 		.data		= &sysctl_dccp_retries2,
 		.maxlen		= sizeof(sysctl_dccp_retries2),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,
 	},
 	{
 		.procname	= "tx_qlen",
 		.data		= &sysctl_dccp_tx_qlen,
 		.maxlen		= sizeof(sysctl_dccp_tx_qlen),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
 	},
 	{
 		.procname	= "sync_ratelimit",
-- 
cgit v1.2.3


From 76f738a7950b559a23ab3c692c99a02f35a54f7f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Debugging functions for feature negotiation

Since all feature-negotiation processing now takes place in feat.c, functions
for producing verbose debugging output are concentrated there.

New functions to print out values, entry records, and options are provided,
and also a macro is defined to not always have the function name in the
output line.

Thanks a lot to Wei Yongjun and Giuseppe Galeota for help with errors in an
earlier revision of this patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h    |   2 +
 net/dccp/feat.c    | 153 +++++++++++++++++++++++++++++++++++++----------------
 net/dccp/feat.h    |  13 -----
 net/dccp/options.c |   4 --
 4 files changed, 109 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 4f681f1a16c..94ae6d41d72 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -42,9 +42,11 @@
 extern int dccp_debug;
 #define dccp_pr_debug(format, a...)	  DCCP_PR_DEBUG(dccp_debug, format, ##a)
 #define dccp_pr_debug_cat(format, a...)   DCCP_PRINTK(dccp_debug, format, ##a)
+#define dccp_debug(fmt, a...)		  dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
 #else
 #define dccp_pr_debug(format, a...)
 #define dccp_pr_debug_cat(format, a...)
+#define dccp_debug(format, a...)
 #endif
 
 extern struct inet_hashinfo dccp_hashinfo;
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 4c95cbdb0e0..3abacad0b22 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -204,6 +204,100 @@ static int dccp_feat_default_value(u8 feat_num)
 	return idx < 0 ? : dccp_feat_table[idx].default_value;
 }
 
+/*
+ *	Debugging and verbose-printing section
+ */
+static const char *dccp_feat_fname(const u8 feat)
+{
+	static const char *feature_names[] = {
+		[DCCPF_RESERVED]	= "Reserved",
+		[DCCPF_CCID]		= "CCID",
+		[DCCPF_SHORT_SEQNOS]	= "Allow Short Seqnos",
+		[DCCPF_SEQUENCE_WINDOW]	= "Sequence Window",
+		[DCCPF_ECN_INCAPABLE]	= "ECN Incapable",
+		[DCCPF_ACK_RATIO]	= "Ack Ratio",
+		[DCCPF_SEND_ACK_VECTOR]	= "Send ACK Vector",
+		[DCCPF_SEND_NDP_COUNT]	= "Send NDP Count",
+		[DCCPF_MIN_CSUM_COVER]	= "Min. Csum Coverage",
+		[DCCPF_DATA_CHECKSUM]	= "Send Data Checksum",
+	};
+	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
+		return feature_names[DCCPF_RESERVED];
+
+	if (feat ==  DCCPF_SEND_LEV_RATE)
+		return "Send Loss Event Rate";
+	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
+		return "CCID-specific";
+
+	return feature_names[feat];
+}
+
+static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING",
+					 "UNSTABLE", "STABLE" };
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_feat_oname(const u8 opt)
+{
+	switch (opt) {
+	case DCCPO_CHANGE_L:  return "Change_L";
+	case DCCPO_CONFIRM_L: return "Confirm_L";
+	case DCCPO_CHANGE_R:  return "Change_R";
+	case DCCPO_CONFIRM_R: return "Confirm_R";
+	}
+	return NULL;
+}
+
+static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
+{
+	u8 i, type = dccp_feat_type(feat_num);
+
+	if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
+		dccp_pr_debug_cat("(NULL)");
+	else if (type == FEAT_SP)
+		for (i = 0; i < val->sp.len; i++)
+			dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
+	else if (type == FEAT_NN)
+		dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
+	else
+		dccp_pr_debug_cat("unknown type %u", type);
+}
+
+static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
+{
+	u8 type = dccp_feat_type(feat_num);
+	dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
+
+	if (type == FEAT_NN)
+		fval.nn = dccp_decode_value_var(list, len);
+	dccp_feat_printval(feat_num, &fval);
+}
+
+static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
+{
+	dccp_debug("   * %s %s = ", entry->is_local ? "local" : "remote",
+				    dccp_feat_fname(entry->feat_num));
+	dccp_feat_printval(entry->feat_num, &entry->val);
+	dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
+			  entry->needs_confirm ? "(Confirm pending)" : "");
+}
+
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)	do {	      \
+	dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
+	dccp_feat_printvals(feat, val, len);				      \
+	dccp_pr_debug_cat(") %s\n", mandatory ? "!" : "");	} while (0)
+
+#define dccp_feat_print_fnlist(fn_list)  {		\
+	const struct dccp_feat_entry *___entry;		\
+							\
+	dccp_pr_debug("List Dump:\n");			\
+	list_for_each_entry(___entry, fn_list, node)	\
+		dccp_feat_print_entry(___entry);	\
+}
+#else	/* ! CONFIG_IP_DCCP_DEBUG */
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
+#define dccp_feat_print_fnlist(fn_list)
+#endif
+
 static int __dccp_feat_activate(struct sock *sk, const int idx,
 				const bool is_local, dccp_feat_val const *fval)
 {
@@ -236,6 +330,10 @@ static int __dccp_feat_activate(struct sock *sk, const int idx,
 	/* Location is RX if this is a local-RX or remote-TX feature */
 	rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
 
+	dccp_debug("   -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
+		   dccp_feat_fname(dccp_feat_table[idx].feat_num),
+		   fval ? "" : "default ",  (unsigned long long)val);
+
 	return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
 }
 
@@ -539,6 +637,7 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
 				return -1;
 			}
 		}
+		dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
 
 		if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
 			return -1;
@@ -792,6 +891,7 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp)
 	while (i--)
 		if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
 			return -1;
+	dccp_feat_print_fnlist(fn);
 	return 0;
 }
 
@@ -910,6 +1010,8 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
 	if (len == 0 || type == FEAT_UNKNOWN)		/* 6.1 and 6.6.8 */
 		goto unknown_feature_or_value;
 
+	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
 	/*
 	 *	Negotiation of NN features: Change R is invalid, so there is no
 	 *	simultaneous negotiation; hence we do not look up in the list.
@@ -1015,6 +1117,8 @@ static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
 	const bool local = (opt == DCCPO_CONFIRM_R);
 	struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
 
+	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
 	if (entry == NULL) {	/* nothing queued: ignore or handle error */
 		if (is_mandatory && type == FEAT_UNKNOWN)
 			return DCCP_RESET_CODE_MANDATORY_ERROR;
@@ -1217,9 +1321,10 @@ int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
 			goto activation_failed;
 		}
 		if (cur->state != FEAT_STABLE) {
-			DCCP_CRIT("Negotiation of %s %u failed in state %u",
+			DCCP_CRIT("Negotiation of %s %s failed in state %s",
 				  cur->is_local ? "local" : "remote",
-				  cur->feat_num, cur->state);
+				  dccp_feat_fname(cur->feat_num),
+				  dccp_feat_sname[cur->state]);
 			goto activation_failed;
 		}
 		fvals[idx][cur->is_local] = &cur->val;
@@ -1260,47 +1365,3 @@ activation_failed:
 	dp->dccps_hc_rx_ackvec = NULL;
 	return -1;
 }
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-const char *dccp_feat_typename(const u8 type)
-{
-	switch(type) {
-	case DCCPO_CHANGE_L:  return("ChangeL");
-	case DCCPO_CONFIRM_L: return("ConfirmL");
-	case DCCPO_CHANGE_R:  return("ChangeR");
-	case DCCPO_CONFIRM_R: return("ConfirmR");
-	/* the following case must not appear in feature negotation  */
-	default:	      dccp_pr_debug("unknown type %d [BUG!]\n", type);
-	}
-	return NULL;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_typename);
-
-const char *dccp_feat_name(const u8 feat)
-{
-	static const char *feature_names[] = {
-		[DCCPF_RESERVED]	= "Reserved",
-		[DCCPF_CCID]		= "CCID",
-		[DCCPF_SHORT_SEQNOS]	= "Allow Short Seqnos",
-		[DCCPF_SEQUENCE_WINDOW]	= "Sequence Window",
-		[DCCPF_ECN_INCAPABLE]	= "ECN Incapable",
-		[DCCPF_ACK_RATIO]	= "Ack Ratio",
-		[DCCPF_SEND_ACK_VECTOR]	= "Send ACK Vector",
-		[DCCPF_SEND_NDP_COUNT]	= "Send NDP Count",
-		[DCCPF_MIN_CSUM_COVER]	= "Min. Csum Coverage",
-		[DCCPF_DATA_CHECKSUM]	= "Send Data Checksum",
-	};
-	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
-		return feature_names[DCCPF_RESERVED];
-
-	if (feat ==  DCCPF_SEND_LEV_RATE)
-		return "Send Loss Event Rate";
-	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
-		return "CCID-specific";
-
-	return feature_names[feat];
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_name);
-#endif /* CONFIG_IP_DCCP_DEBUG */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f8456bca4ee..2217066e22d 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -106,19 +106,6 @@ extern unsigned long sysctl_dccp_sequence_window;
 extern int	     sysctl_dccp_rx_ccid;
 extern int	     sysctl_dccp_tx_ccid;
 
-#ifdef CONFIG_IP_DCCP_DEBUG
-extern const char *dccp_feat_typename(const u8 type);
-extern const char *dccp_feat_name(const u8 feat);
-
-static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
-{
-	dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type),
-					   dccp_feat_name(feat), feat, val);
-}
-#else
-#define dccp_feat_debug(type, feat, val)
-#endif /* CONFIG_IP_DCCP_DEBUG */
-
 extern int  dccp_feat_init(struct sock *sk);
 extern void dccp_feat_initialise_sysctls(void);
 extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 5906e96eedd..fd51cc70c63 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -498,10 +498,6 @@ int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
 		*to++ = *val;
 	if (len)
 		memcpy(to, val, len);
-
-	dccp_pr_debug("%s(%s (%d), ...), length %d\n",
-		      dccp_feat_typename(type),
-		      dccp_feat_name(feat), feat, len);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 624a965a93610152b10c73d050ed44812efa8abe Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Support for the exchange of NN options in established state

In contrast to static feature negotiation at the begin of a connection, which
establishes the capabilities of both endpoints, this patch introduces support
for dynamic exchange of feature negotiation options.

Such a dynamic exchange is necessary in at least two cases:
 * CCID-2's Ack Ratio (RFC 4341, 6.1.2) which changes during the connection;
 * Sequence Window values that, as per RFC 4340, 7.5.2, should be sent "as
   as the connection progresses".

Both are NN (non-negotiable) features. Hence dynamic feature "negotiation" is
distinguished from static/pre-connection negotiation by the following:
 * no new capabilities are negotiated (those that matter for the connection
   are negotiated prior to setting up the connection, comparable to SIP);
 * features must be understood by each endpoint: as per RFC 4340, 6.4,
   Sequence Window is "Req'd" and Ack Ratio must be understood when CCID-2
   is used as per the note underneath Table 4.

These characteristics are reflected in the implementation:
 * only NN options can be exchanged after connection setup;
 * NN options are activated directly after validating them. The rationale is
   that a peer must accept every valid NN value (RFC 4340, 6.3.2), hence it
   will either accept the value and send a "Confirm R", or it will send an
   empty Confirm (which will reset the connection according to FN rules).
 * An Ack is scheduled directly after activation to accelerate communicating
   the update to the peer.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/dccp.h |  1 +
 net/dccp/feat.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 94ae6d41d72..0a07b2e24e6 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -436,6 +436,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
 	       inet_csk_ack_scheduled(sk);
 }
 
+extern int  dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
 extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
 extern int  dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
 extern int  dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 3abacad0b22..5be8b85ac74 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -13,6 +13,7 @@
  *  -----------
  *  o Feature negotiation is coordinated with connection setup (as in TCP), wild
  *    changes of parameters of an established connection are not supported.
+ *  o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN.
  *  o All currently known SP features have 1-byte quantities. If in the future
  *    extensions of RFCs 4340..42 define features with item lengths larger than
  *    one byte, a feature-specific extension of the code will be required.
@@ -337,6 +338,20 @@ static int __dccp_feat_activate(struct sock *sk, const int idx,
 	return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
 }
 
+/**
+ * dccp_feat_activate  -  Activate feature value on socket
+ * @sk: fully connected DCCP socket (after handshake is complete)
+ * @feat_num: feature to activate, one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ * For general use this function is preferable over __dccp_feat_activate().
+ */
+static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
+			      dccp_feat_val const *fval)
+{
+	return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
+}
+
 /* Test for "Req'd" feature (RFC 4340, 6.4) */
 static inline int dccp_feat_must_be_understood(u8 feat_num)
 {
@@ -734,6 +749,48 @@ int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
 	return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
 }
 
+/**
+ * dccp_feat_signal_nn_change  -  Update NN values for an established connection
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * @nn_val: the new value to use
+ * This function is used to communicate NN updates out-of-band. The difference
+ * to feature negotiation during connection setup is that values are activated
+ * immediately after validation, i.e. we don't wait for the Confirm: either the
+ * value is accepted by the peer (and then the waiting is futile), or it is not
+ * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values
+ * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2).
+ */
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
+{
+	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+	dccp_feat_val fval = { .nn = nn_val };
+	struct dccp_feat_entry *entry;
+
+	if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+		return 0;
+
+	if (dccp_feat_type(feat) != FEAT_NN ||
+	    !dccp_feat_is_valid_nn_val(feat, nn_val))
+		return -EINVAL;
+
+	entry = dccp_feat_list_lookup(fn, feat, 1);
+	if (entry != NULL) {
+		dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n",
+			      (unsigned long long)nn_val,
+			      (unsigned long long)entry->val.nn,
+			      dccp_feat_sname[entry->state]);
+		return 0;
+	}
+
+	if (dccp_feat_activate(sk, feat, 1, &fval))
+		return -EADV;
+
+	inet_csk_schedule_ack(sk);
+	return dccp_feat_push_change(fn, feat, 1, 0, &fval);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
+
 /*
  *	Tracking features whose value depend on the choice of CCID
  *
-- 
cgit v1.2.3


From 4861a354430d2ea36847ef88086c7449b4f385b6 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Support for exchanging of NN options in established state

This patch provides support for the reception of NN options in (PART)OPEN state.

It is a combination of change_recv() and confirm_recv(), specifically geared
towards receiving the `fast-path' NN options.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 5be8b85ac74..c847c80d1b9 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1246,6 +1246,93 @@ confirmation_failed:
 			    : DCCP_RESET_CODE_OPTION_ERROR;
 }
 
+/**
+ * dccp_feat_handle_nn_established  -  Fast-path reception of NN options
+ * @sk:		socket of an established DCCP connection
+ * @mandatory:	whether @opt was preceded by a Mandatory option
+ * @opt:	%DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
+ * @feat:	NN number, one of %dccp_feature_numbers
+ * @val:	NN value
+ * @len:	length of @val in bytes
+ * This function combines the functionality of change_recv/confirm_recv, with
+ * the following differences (reset codes are the same):
+ *    - cleanup after receiving the Confirm;
+ *    - values are directly activated after successful parsing;
+ *    - deliberately restricted to NN features.
+ * The restriction to NN features is essential since SP features can have non-
+ * predictable outcomes (depending on the remote configuration), and are inter-
+ * dependent (CCIDs for instance cause further dependencies).
+ */
+static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
+					  u8 feat, u8 *val, u8 len)
+{
+	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+	const bool local = (opt == DCCPO_CONFIRM_R);
+	struct dccp_feat_entry *entry;
+	u8 type = dccp_feat_type(feat);
+	dccp_feat_val fval;
+
+	dccp_feat_print_opt(opt, feat, val, len, mandatory);
+
+	/* Ignore non-mandatory unknown and non-NN features */
+	if (type == FEAT_UNKNOWN) {
+		if (local && !mandatory)
+			return 0;
+		goto fast_path_unknown;
+	} else if (type != FEAT_NN) {
+		return 0;
+	}
+
+	/*
+	 * We don't accept empty Confirms, since in fast-path feature
+	 * negotiation the values are enabled immediately after sending
+	 * the Change option.
+	 * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
+	 */
+	if (len == 0 || len > sizeof(fval.nn))
+		goto fast_path_unknown;
+
+	if (opt == DCCPO_CHANGE_L) {
+		fval.nn = dccp_decode_value_var(val, len);
+		if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+			goto fast_path_unknown;
+
+		if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
+		    dccp_feat_activate(sk, feat, local, &fval))
+			return DCCP_RESET_CODE_TOO_BUSY;
+
+		/* set the `Ack Pending' flag to piggyback a Confirm */
+		inet_csk_schedule_ack(sk);
+
+	} else if (opt == DCCPO_CONFIRM_R) {
+		entry = dccp_feat_list_lookup(fn, feat, local);
+		if (entry == NULL || entry->state != FEAT_CHANGING)
+			return 0;
+
+		fval.nn = dccp_decode_value_var(val, len);
+		if (fval.nn != entry->val.nn) {
+			DCCP_WARN("Bogus Confirm for non-existing value\n");
+			goto fast_path_failed;
+		}
+
+		/* It has been confirmed - so remove the entry */
+		dccp_feat_list_pop(entry);
+
+	} else {
+		DCCP_WARN("Received illegal option %u\n", opt);
+		goto fast_path_failed;
+	}
+	return 0;
+
+fast_path_unknown:
+	if (!mandatory)
+		return dccp_push_empty_confirm(fn, feat, local);
+
+fast_path_failed:
+	return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+			 : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
 /**
  * dccp_feat_parse_options  -  Process Feature-Negotiation Options
  * @sk: for general use and used by the client during connection setup
@@ -1281,6 +1368,15 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
 						      val, len, server);
 		}
+		break;
+	/*
+	 *	Support for exchanging NN options on an established connection
+	 *	This is currently restricted to Ack Ratio (RFC 4341, 6.1.2)
+	 */
+	case DCCP_OPEN:
+	case DCCP_PARTOPEN:
+		return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
+						       val, len);
 	}
 	return 0;	/* ignore FN options in all other states */
 }
-- 
cgit v1.2.3


From 2faae5587f692fd5c79856ca4c4b90944ee0472a Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Use feature-negotiation to report Ack Ratio changes

This uses the new feature-negotiation framework to signal Ack Ratio changes,
as required by RFC 4341, sec. 6.1.2.

This raises some problems for CCID-2 since it can at the moment not cope
gracefully with Ack Ratio of e.g. 2. A FIXME has thus been added which
reverts to the existing policy of bypassing the Ack Ratio sysctl.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/feat.c  | 12 ++++++++++++
 net/dccp/proto.c |  1 -
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index c847c80d1b9..f94c7c9d1a7 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -74,6 +74,18 @@ static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
 
 static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
 {
+#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__
+	/*
+	 * FIXME: This is required until several problems in the CCID-2 code are
+	 * resolved. The CCID-2 code currently does not cope well; using dynamic
+	 * Ack Ratios greater than 1 caused instabilities. These were manifest
+	 * in hangups and long RTO timeouts (1...3 seconds). Until this has been
+	 * stabilised, it is safer not to activate dynamic Ack Ratio changes.
+	 */
+	dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n",
+		      rx ? "RX" : "TX", (u16)ratio);
+	ratio = 1;
+#endif
 	if (rx)
 		dccp_sk(sk)->dccps_r_ack_ratio = ratio;
 	else
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 392a5d822b3..11905e0cf8f 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -189,7 +189,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	dp->dccps_rate_last	= jiffies;
 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
-	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
 
 	dccp_init_xmit_timers(sk);
 
-- 
cgit v1.2.3


From 55ebe3ab2d504bd3f3eeade0262826210019abda Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Leave headroom for options when calculating the MPS

The Maximum Packet Size (MPS) is of interest for applications which want
to transfer data, so it is only relevant to the data transfer phase of a
connection (unless one wants to send data on the DCCP-Request, but that is
not considered here).

The strategy chosen to deal with this requirement is to leave room for only
such options that may appear on data packets.

A special consideration applies to Ack Vectors: this is purely guesswork,
since these can have any length between 3 and 1020 bytes. The strategy
chosen here is to subtract a configurable minimum, the value of 16 bytes
(2 bytes for type/length plus 14 Ack Vector cells) has been found by
experimentatation. If people experience this as too much or too little,
this could later be turned into a Kconfig option.

There are currently no CCID-specific header options which may appear on data
packets, hence it is not necessary to define a corresponding CCID field.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ackvec.h |  3 +++
 net/dccp/output.c | 22 ++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 4ccee030524..1c10814fdf7 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -20,6 +20,9 @@
 /* We can spread an ack vector across multiple options */
 #define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
 
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN	16
+
 #define DCCP_ACKVEC_STATE_RECEIVED	0
 #define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
 #define DCCP_ACKVEC_STATE_NOT_RECEIVED	(3 << 6)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 19a93d5433a..0aab919dafe 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
 	u32 ccmps = dccp_determine_ccmps(dp);
-	int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
+	u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
 
 	/* Account for header lengths and IPv4/v6 option overhead */
 	cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
 		    sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
 
 	/*
-	 * FIXME: this should come from the CCID infrastructure, where, say,
-	 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
-	 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
-	 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
-	 * make it a multiple of 4
+	 * Leave enough headroom for common DCCP header options.
+	 * This only considers options which may appear on DCCP-Data packets, as
+	 * per table 3 in RFC 4340, 5.8. When running out of space for other
+	 * options (eg. Ack Vector which can take up to 255 bytes), it is better
+	 * to schedule a separate Ack. Thus we leave headroom for the following:
+	 *  - 1 byte for Slow Receiver (11.6)
+	 *  - 6 bytes for Timestamp (13.1)
+	 *  - 10 bytes for Timestamp Echo (13.3)
+	 *  - 8 bytes for NDP count (7.7, when activated)
+	 *  - 6 bytes for Data Checksum (9.3)
+	 *  - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
 	 */
-
-	cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
+	cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+			   (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
 
 	/* And store cached results */
 	icsk->icsk_pmtu_cookie = pmtu;
-- 
cgit v1.2.3


From 88ddac513a4e7e04234214b600401ec22abfbb46 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Special case of the MPS for client-PARTOPEN with DataAcks

To increase robustness, it is necessary to resend Confirm feature-negotiation
options, even though the RFC does not mandate it. But feature negotiation
options can take (much) more room than the options on common DataAck packets.

Instead of reducing the MPS always for a case which only applies to the three
messages send during initial handshake, this patch devises a special case:

   if the payload length of the DataAck in PARTOPEN is too large, an Ack is sent
   to carry the options, and the feature-negotiation list is then flushed.

   This means that the server gets two Acks for one Response. If both Acks get
   lost, it is probably better to restart the connection anyway and devising yet
   another special-case does not seem worth the extra complexity.

The patch (over-)estimates the expected overhead to be 32*4 bytes -- commonly
seen values were 20-90 bytes for initial feature-negotiation options.

It uses sizeof(u32) to mean "aligned units of 4 bytes". For consistency,
another use of sizeof is modified.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/dccp.h   |  5 ++++-
 net/dccp/output.c | 15 ++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 0a07b2e24e6..c7370de4c4d 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -63,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
  *    - DCCP-Reset    with ACK Subheader and 4 bytes of Reset Code fields
  *  Hence a safe upper bound for the maximum option length is 1020-28 = 992
  */
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
 #define DCCP_MAX_PACKET_HDR 28
 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
 #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
 
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD	 (32 * sizeof(uint32_t))
+
 #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
 				     * state, about 60 seconds */
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 0aab919dafe..9888f61f9b0 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -276,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block)
 			const int len = skb->len;
 
 			if (sk->sk_state == DCCP_PARTOPEN) {
-				/* See 8.1.5.  Handshake Completion */
+				const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+				/*
+				 * See 8.1.5 - Handshake Completion.
+				 *
+				 * For robustness we resend Confirm options until the client has
+				 * entered OPEN. During the initial feature negotiation, the MPS
+				 * is smaller than usual, reduced by the Change/Confirm options.
+				 */
+				if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+					DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+					dccp_send_ack(sk);
+					dccp_feat_list_purge(&dp->dccps_featneg);
+				}
+
 				inet_csk_schedule_ack(sk);
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						  inet_csk(sk)->icsk_rto,
-- 
cgit v1.2.3


From 1fb87509606cb19f5f603e54c28af7da149049f3 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Remove ccid2hc{tx,rx}_ prefixes

This patch fixes two problems caused by the ubiquitous long "hctx->ccid2htx_"
and "hcrx->ccid2hcrx_" prefixes:
 * code becomes hard to read;
 * multiple-line statements are almost inevitable even for simple expressions;
The prefixes are not really necessary (compare with "struct tcp_sock").

There had been previous discussion of this on dccp@vger, but so far this was
not followed up (most people agreed that the prefixes are too long).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Leandro Melo de Sales <leandroal@gmail.com>
---
 net/dccp/ccids/ccid2.c | 274 ++++++++++++++++++++++++-------------------------
 net/dccp/ccids/ccid2.h |  48 ++++-----
 2 files changed, 159 insertions(+), 163 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index c9ea19a4d85..9728bbf0ace 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -39,16 +39,16 @@ static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
 {
 	int len = 0;
 	int pipe = 0;
-	struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
+	struct ccid2_seq *seqp = hctx->seqh;
 
 	/* there is data in the chain */
-	if (seqp != hctx->ccid2hctx_seqt) {
+	if (seqp != hctx->seqt) {
 		seqp = seqp->ccid2s_prev;
 		len++;
 		if (!seqp->ccid2s_acked)
 			pipe++;
 
-		while (seqp != hctx->ccid2hctx_seqt) {
+		while (seqp != hctx->seqt) {
 			struct ccid2_seq *prev = seqp->ccid2s_prev;
 
 			len++;
@@ -65,16 +65,16 @@ static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
 		}
 	}
 
-	BUG_ON(pipe != hctx->ccid2hctx_pipe);
+	BUG_ON(pipe != hctx->pipe);
 	ccid2_pr_debug("len of chain=%d\n", len);
 
 	do {
 		seqp = seqp->ccid2s_prev;
 		len++;
-	} while (seqp != hctx->ccid2hctx_seqh);
+	} while (seqp != hctx->seqh);
 
 	ccid2_pr_debug("total len=%d\n", len);
-	BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
+	BUG_ON(len != hctx->seqbufc * CCID2_SEQBUF_LEN);
 }
 #else
 #define ccid2_pr_debug(format, a...)
@@ -87,8 +87,7 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	int i;
 
 	/* check if we have space to preserve the pointer to the buffer */
-	if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
-					sizeof(struct ccid2_seq*)))
+	if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *))
 		return -ENOMEM;
 
 	/* allocate buffer and initialize linked list */
@@ -104,20 +103,20 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
 
 	/* This is the first allocation.  Initiate the head and tail.  */
-	if (hctx->ccid2hctx_seqbufc == 0)
-		hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
+	if (hctx->seqbufc == 0)
+		hctx->seqh = hctx->seqt = seqp;
 	else {
 		/* link the existing list with the one we just created */
-		hctx->ccid2hctx_seqh->ccid2s_next = seqp;
-		seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
+		hctx->seqh->ccid2s_next = seqp;
+		seqp->ccid2s_prev = hctx->seqh;
 
-		hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt;
+		hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt;
 	}
 
 	/* store the original pointer to the buffer so we can free it */
-	hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
-	hctx->ccid2hctx_seqbufc++;
+	hctx->seqbuf[hctx->seqbufc] = seqp;
+	hctx->seqbufc++;
 
 	return 0;
 }
@@ -126,7 +125,7 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
+	if (hctx->pipe < hctx->cwnd)
 		return 0;
 
 	return 1; /* XXX CCID should dequeue when ready instead of polling */
@@ -135,7 +134,7 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
+	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2);
 
 	/*
 	 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
@@ -160,7 +159,7 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
 {
 	ccid2_pr_debug("change SRTT to %ld\n", val);
-	hctx->ccid2hctx_srtt = val;
+	hctx->srtt = val;
 }
 
 static void ccid2_start_rto_timer(struct sock *sk);
@@ -173,8 +172,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
-		sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
-			       jiffies + HZ / 5);
+		sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5);
 		goto out;
 	}
 
@@ -183,28 +181,28 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	ccid2_hc_tx_check_sanity(hctx);
 
 	/* back-off timer */
-	hctx->ccid2hctx_rto <<= 1;
+	hctx->rto <<= 1;
 
-	s = hctx->ccid2hctx_rto / HZ;
+	s = hctx->rto / HZ;
 	if (s > 60)
-		hctx->ccid2hctx_rto = 60 * HZ;
+		hctx->rto = 60 * HZ;
 
 	ccid2_start_rto_timer(sk);
 
 	/* adjust pipe, cwnd etc */
-	hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
-	if (hctx->ccid2hctx_ssthresh < 2)
-		hctx->ccid2hctx_ssthresh = 2;
-	hctx->ccid2hctx_cwnd	 = 1;
-	hctx->ccid2hctx_pipe	 = 0;
+	hctx->ssthresh = hctx->cwnd / 2;
+	if (hctx->ssthresh < 2)
+		hctx->ssthresh = 2;
+	hctx->cwnd = 1;
+	hctx->pipe = 0;
 
 	/* clear state about stuff we sent */
-	hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
-	hctx->ccid2hctx_packets_acked = 0;
+	hctx->seqt = hctx->seqh;
+	hctx->packets_acked = 0;
 
 	/* clear ack ratio state. */
-	hctx->ccid2hctx_rpseq	 = 0;
-	hctx->ccid2hctx_rpdupack = -1;
+	hctx->rpseq    = 0;
+	hctx->rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
 	ccid2_hc_tx_check_sanity(hctx);
 out:
@@ -216,11 +214,11 @@ static void ccid2_start_rto_timer(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
+	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->rto);
 
-	BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
-	sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
-		       jiffies + hctx->ccid2hctx_rto);
+	BUG_ON(timer_pending(&hctx->rtotimer));
+	sk_reset_timer(sk, &hctx->rtotimer,
+		       jiffies + hctx->rto);
 }
 
 static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
@@ -229,27 +227,26 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	struct ccid2_seq *next;
 
-	hctx->ccid2hctx_pipe++;
+	hctx->pipe++;
 
-	hctx->ccid2hctx_seqh->ccid2s_seq   = dp->dccps_gss;
-	hctx->ccid2hctx_seqh->ccid2s_acked = 0;
-	hctx->ccid2hctx_seqh->ccid2s_sent  = jiffies;
+	hctx->seqh->ccid2s_seq   = dp->dccps_gss;
+	hctx->seqh->ccid2s_acked = 0;
+	hctx->seqh->ccid2s_sent  = jiffies;
 
-	next = hctx->ccid2hctx_seqh->ccid2s_next;
+	next = hctx->seqh->ccid2s_next;
 	/* check if we need to alloc more space */
-	if (next == hctx->ccid2hctx_seqt) {
+	if (next == hctx->seqt) {
 		if (ccid2_hc_tx_alloc_seq(hctx)) {
 			DCCP_CRIT("packet history - out of memory!");
 			/* FIXME: find a more graceful way to bail out */
 			return;
 		}
-		next = hctx->ccid2hctx_seqh->ccid2s_next;
-		BUG_ON(next == hctx->ccid2hctx_seqt);
+		next = hctx->seqh->ccid2s_next;
+		BUG_ON(next == hctx->seqt);
 	}
-	hctx->ccid2hctx_seqh = next;
+	hctx->seqh = next;
 
-	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
-		       hctx->ccid2hctx_pipe);
+	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe);
 
 	/*
 	 * FIXME: The code below is broken and the variables have been removed
@@ -272,12 +269,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 	 */
 #if 0
 	/* Ack Ratio.  Need to maintain a concept of how many windows we sent */
-	hctx->ccid2hctx_arsent++;
+	hctx->arsent++;
 	/* We had an ack loss in this window... */
-	if (hctx->ccid2hctx_ackloss) {
-		if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
-			hctx->ccid2hctx_arsent	= 0;
-			hctx->ccid2hctx_ackloss	= 0;
+	if (hctx->ackloss) {
+		if (hctx->arsent >= hctx->cwnd) {
+			hctx->arsent  = 0;
+			hctx->ackloss = 0;
 		}
 	} else {
 		/* No acks lost up to now... */
@@ -287,28 +284,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 			int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
 				    dp->dccps_l_ack_ratio;
 
-			denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
+			denom = hctx->cwnd * hctx->cwnd / denom;
 
-			if (hctx->ccid2hctx_arsent >= denom) {
+			if (hctx->arsent >= denom) {
 				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
-				hctx->ccid2hctx_arsent = 0;
+				hctx->arsent = 0;
 			}
 		} else {
 			/* we can't increase ack ratio further [1] */
-			hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
+			hctx->arsent = 0; /* or maybe set it to cwnd*/
 		}
 	}
 #endif
 
 	/* setup RTO timer */
-	if (!timer_pending(&hctx->ccid2hctx_rtotimer))
+	if (!timer_pending(&hctx->rtotimer))
 		ccid2_start_rto_timer(sk);
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 	do {
-		struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
+		struct ccid2_seq *seqp = hctx->seqt;
 
-		while (seqp != hctx->ccid2hctx_seqh) {
+		while (seqp != hctx->seqh) {
 			ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
 				       (unsigned long long)seqp->ccid2s_seq,
 				       seqp->ccid2s_acked, seqp->ccid2s_sent);
@@ -386,7 +383,7 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
+	sk_stop_timer(sk, &hctx->rtotimer);
 	ccid2_pr_debug("deleted RTO timer\n");
 }
 
@@ -396,73 +393,73 @@ static inline void ccid2_new_ack(struct sock *sk,
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
-		if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
-			hctx->ccid2hctx_cwnd += 1;
-			*maxincr	     -= 1;
-			hctx->ccid2hctx_packets_acked = 0;
+	if (hctx->cwnd < hctx->ssthresh) {
+		if (*maxincr > 0 && ++hctx->packets_acked == 2) {
+			hctx->cwnd += 1;
+			*maxincr   -= 1;
+			hctx->packets_acked = 0;
 		}
-	} else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
-			hctx->ccid2hctx_cwnd += 1;
-			hctx->ccid2hctx_packets_acked = 0;
+	} else if (++hctx->packets_acked >= hctx->cwnd) {
+			hctx->cwnd += 1;
+			hctx->packets_acked = 0;
 	}
 
 	/* update RTO */
-	if (hctx->ccid2hctx_srtt == -1 ||
-	    time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
+	if (hctx->srtt == -1 ||
+	    time_after(jiffies, hctx->lastrtt + hctx->srtt)) {
 		unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
 		int s;
 
 		/* first measurement */
-		if (hctx->ccid2hctx_srtt == -1) {
+		if (hctx->srtt == -1) {
 			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
 				       r, jiffies,
 				       (unsigned long long)seqp->ccid2s_seq);
 			ccid2_change_srtt(hctx, r);
-			hctx->ccid2hctx_rttvar = r >> 1;
+			hctx->rttvar = r >> 1;
 		} else {
 			/* RTTVAR */
-			long tmp = hctx->ccid2hctx_srtt - r;
+			long tmp = hctx->srtt - r;
 			long srtt;
 
 			if (tmp < 0)
 				tmp *= -1;
 
 			tmp >>= 2;
-			hctx->ccid2hctx_rttvar *= 3;
-			hctx->ccid2hctx_rttvar >>= 2;
-			hctx->ccid2hctx_rttvar += tmp;
+			hctx->rttvar *= 3;
+			hctx->rttvar >>= 2;
+			hctx->rttvar += tmp;
 
 			/* SRTT */
-			srtt = hctx->ccid2hctx_srtt;
+			srtt = hctx->srtt;
 			srtt *= 7;
 			srtt >>= 3;
 			tmp = r >> 3;
 			srtt += tmp;
 			ccid2_change_srtt(hctx, srtt);
 		}
-		s = hctx->ccid2hctx_rttvar << 2;
+		s = hctx->rttvar << 2;
 		/* clock granularity is 1 when based on jiffies */
 		if (!s)
 			s = 1;
-		hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
+		hctx->rto = hctx->srtt + s;
 
 		/* must be at least a second */
-		s = hctx->ccid2hctx_rto / HZ;
+		s = hctx->rto / HZ;
 		/* DCCP doesn't require this [but I like it cuz my code sux] */
 #if 1
 		if (s < 1)
-			hctx->ccid2hctx_rto = HZ;
+			hctx->rto = HZ;
 #endif
 		/* max 60 seconds */
 		if (s > 60)
-			hctx->ccid2hctx_rto = HZ * 60;
+			hctx->rto = HZ * 60;
 
-		hctx->ccid2hctx_lastrtt = jiffies;
+		hctx->lastrtt = jiffies;
 
 		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
-			       hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
-			       hctx->ccid2hctx_rto, HZ, r);
+			       hctx->srtt, hctx->rttvar,
+			       hctx->rto, HZ, r);
 	}
 
 	/* we got a new ack, so re-start RTO timer */
@@ -474,12 +471,12 @@ static void ccid2_hc_tx_dec_pipe(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (hctx->ccid2hctx_pipe == 0)
+	if (hctx->pipe == 0)
 		DCCP_BUG("pipe == 0");
 	else
-		hctx->ccid2hctx_pipe--;
+		hctx->pipe--;
 
-	if (hctx->ccid2hctx_pipe == 0)
+	if (hctx->pipe == 0)
 		ccid2_hc_tx_kill_rto_timer(sk);
 }
 
@@ -487,19 +484,19 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
+	if (time_before(seqp->ccid2s_sent, hctx->last_cong)) {
 		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
 		return;
 	}
 
-	hctx->ccid2hctx_last_cong = jiffies;
+	hctx->last_cong = jiffies;
 
-	hctx->ccid2hctx_cwnd     = hctx->ccid2hctx_cwnd / 2 ? : 1U;
-	hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U);
+	hctx->cwnd     = hctx->cwnd / 2 ? : 1U;
+	hctx->ssthresh = max(hctx->cwnd, 2U);
 
 	/* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
-	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
-		ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
+	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd)
+		ccid2_change_l_ack_ratio(sk, hctx->cwnd);
 }
 
 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -523,21 +520,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * -sorbo.
 	 */
 	/* need to bootstrap */
-	if (hctx->ccid2hctx_rpdupack == -1) {
-		hctx->ccid2hctx_rpdupack = 0;
-		hctx->ccid2hctx_rpseq = seqno;
+	if (hctx->rpdupack == -1) {
+		hctx->rpdupack = 0;
+		hctx->rpseq = seqno;
 	} else {
 		/* check if packet is consecutive */
-		if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1)
-			hctx->ccid2hctx_rpseq = seqno;
+		if (dccp_delta_seqno(hctx->rpseq, seqno) == 1)
+			hctx->rpseq = seqno;
 		/* it's a later packet */
-		else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
-			hctx->ccid2hctx_rpdupack++;
+		else if (after48(seqno, hctx->rpseq)) {
+			hctx->rpdupack++;
 
 			/* check if we got enough dupacks */
-			if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
-				hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
-				hctx->ccid2hctx_rpseq = 0;
+			if (hctx->rpdupack >= NUMDUPACK) {
+				hctx->rpdupack = -1; /* XXX lame */
+				hctx->rpseq = 0;
 
 				ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
 			}
@@ -546,7 +543,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 	/* check forward path congestion */
 	/* still didn't send out new data packets */
-	if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
+	if (hctx->seqh == hctx->seqt)
 		return;
 
 	switch (DCCP_SKB_CB(skb)->dccpd_type) {
@@ -558,14 +555,14 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
-	if (after48(ackno, hctx->ccid2hctx_high_ack))
-		hctx->ccid2hctx_high_ack = ackno;
+	if (after48(ackno, hctx->high_ack))
+		hctx->high_ack = ackno;
 
-	seqp = hctx->ccid2hctx_seqt;
+	seqp = hctx->seqt;
 	while (before48(seqp->ccid2s_seq, ackno)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->ccid2hctx_seqh) {
-			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+		if (seqp == hctx->seqh) {
+			seqp = hctx->seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -575,7 +572,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * packets per acknowledgement. Rounding up avoids that cwnd is not
 	 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
 	 */
-	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
+	if (hctx->cwnd < hctx->ssthresh)
 		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
 
 	/* go through all ack vectors */
@@ -594,7 +591,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * seqnos.
 			 */
 			while (after48(seqp->ccid2s_seq, ackno)) {
-				if (seqp == hctx->ccid2hctx_seqt) {
+				if (seqp == hctx->seqt) {
 					done = 1;
 					break;
 				}
@@ -626,7 +623,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 						       (unsigned long long)seqp->ccid2s_seq);
 					ccid2_hc_tx_dec_pipe(sk);
 				}
-				if (seqp == hctx->ccid2hctx_seqt) {
+				if (seqp == hctx->seqt) {
 					done = 1;
 					break;
 				}
@@ -645,11 +642,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	/* The state about what is acked should be correct now
 	 * Check for NUMDUPACK
 	 */
-	seqp = hctx->ccid2hctx_seqt;
-	while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
+	seqp = hctx->seqt;
+	while (before48(seqp->ccid2s_seq, hctx->high_ack)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->ccid2hctx_seqh) {
-			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+		if (seqp == hctx->seqh) {
+			seqp = hctx->seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -660,7 +657,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			if (done == NUMDUPACK)
 				break;
 		}
-		if (seqp == hctx->ccid2hctx_seqt)
+		if (seqp == hctx->seqt)
 			break;
 		seqp = seqp->ccid2s_prev;
 	}
@@ -683,20 +680,20 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				ccid2_congestion_event(sk, seqp);
 				ccid2_hc_tx_dec_pipe(sk);
 			}
-			if (seqp == hctx->ccid2hctx_seqt)
+			if (seqp == hctx->seqt)
 				break;
 			seqp = seqp->ccid2s_prev;
 		}
 
-		hctx->ccid2hctx_seqt = last_acked;
+		hctx->seqt = last_acked;
 	}
 
 	/* trim acked packets in tail */
-	while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
-		if (!hctx->ccid2hctx_seqt->ccid2s_acked)
+	while (hctx->seqt != hctx->seqh) {
+		if (!hctx->seqt->ccid2s_acked)
 			break;
 
-		hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
+		hctx->seqt = hctx->seqt->ccid2s_next;
 	}
 
 	ccid2_hc_tx_check_sanity(hctx);
@@ -709,17 +706,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	u32 max_ratio;
 
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
-	hctx->ccid2hctx_ssthresh  = ~0U;
+	hctx->ssthresh = ~0U;
 
 	/*
 	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
 	 * packets for new connections, following the rules from [RFC3390]".
 	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
 	 */
-	hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+	hctx->cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
-	max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
+	max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
 	if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
 		dp->dccps_l_ack_ratio = max_ratio;
 
@@ -727,13 +724,12 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	if (ccid2_hc_tx_alloc_seq(hctx))
 		return -ENOMEM;
 
-	hctx->ccid2hctx_rto	 = 3 * HZ;
+	hctx->rto	 = 3 * HZ;
 	ccid2_change_srtt(hctx, -1);
-	hctx->ccid2hctx_rttvar	 = -1;
-	hctx->ccid2hctx_rpdupack = -1;
-	hctx->ccid2hctx_last_cong = jiffies;
-	setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
-			(unsigned long)sk);
+	hctx->rttvar	= -1;
+	hctx->rpdupack  = -1;
+	hctx->last_cong = jiffies;
+	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
 
 	ccid2_hc_tx_check_sanity(hctx);
 	return 0;
@@ -746,9 +742,9 @@ static void ccid2_hc_tx_exit(struct sock *sk)
 
 	ccid2_hc_tx_kill_rto_timer(sk);
 
-	for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
-		kfree(hctx->ccid2hctx_seqbuf[i]);
-	hctx->ccid2hctx_seqbufc = 0;
+	for (i = 0; i < hctx->seqbufc; i++)
+		kfree(hctx->seqbuf[i]);
+	hctx->seqbufc = 0;
 }
 
 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -759,10 +755,10 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	switch (DCCP_SKB_CB(skb)->dccpd_type) {
 	case DCCP_PKT_DATA:
 	case DCCP_PKT_DATAACK:
-		hcrx->ccid2hcrx_data++;
-		if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
+		hcrx->data++;
+		if (hcrx->data >= dp->dccps_r_ack_ratio) {
 			dccp_send_ack(sk);
-			hcrx->ccid2hcrx_data = 0;
+			hcrx->data = 0;
 		}
 		break;
 	}
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 2c94ca02901..d7815804bce 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,34 +42,34 @@ struct ccid2_seq {
 
 /** struct ccid2_hc_tx_sock - CCID2 TX half connection
  *
- * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
- * @ccid2hctx_lastrtt -time RTT was last measured
- * @ccid2hctx_rpseq - last consecutive seqno
- * @ccid2hctx_rpdupack - dupacks since rpseq
-*/
+ * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
+ * @lastrtt: time RTT was last measured
+ * @rpseq: last consecutive seqno
+ * @rpdupack: dupacks since rpseq
+ */
 struct ccid2_hc_tx_sock {
-	u32			ccid2hctx_cwnd;
-	u32			ccid2hctx_ssthresh;
-	u32			ccid2hctx_pipe;
-	u32			ccid2hctx_packets_acked;
-	struct ccid2_seq	*ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
-	int			ccid2hctx_seqbufc;
-	struct ccid2_seq	*ccid2hctx_seqh;
-	struct ccid2_seq	*ccid2hctx_seqt;
-	long			ccid2hctx_rto;
-	long			ccid2hctx_srtt;
-	long			ccid2hctx_rttvar;
-	unsigned long		ccid2hctx_lastrtt;
-	struct timer_list	ccid2hctx_rtotimer;
-	u64			ccid2hctx_rpseq;
-	int			ccid2hctx_rpdupack;
-	unsigned long		ccid2hctx_last_cong;
-	u64			ccid2hctx_high_ack;
+	u32			cwnd;
+	u32			ssthresh;
+	u32			pipe;
+	u32			packets_acked;
+	struct ccid2_seq	*seqbuf[CCID2_SEQBUF_MAX];
+	int			seqbufc;
+	struct ccid2_seq	*seqh;
+	struct ccid2_seq	*seqt;
+	long			rto;
+	long			srtt;
+	long			rttvar;
+	unsigned long		lastrtt;
+	struct timer_list	rtotimer;
+	u64			rpseq;
+	int			rpdupack;
+	unsigned long		last_cong;
+	u64			high_ack;
 };
 
 struct ccid2_hc_rx_sock {
-	int	ccid2hcrx_data;
+	int			data;
 };
 
 static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
-- 
cgit v1.2.3


From 842d1ef14ff37e9611eab479f31a0d74c1a5c4c0 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Remove ccid3hc{tx,rx}_ prefixes

This patch does the same for CCID-3 as the previous patch for CCID-2:

        s#ccid3hctx_##g;
        s#ccid3hcrx_##g;

plus manual editing to retain consistency.

Please note: expanded the fields of the `struct tfrc_tx_info' in the hc_tx_sock,
since using short #define identifiers is not a good idea. The only place where
this embedded struct was used is ccid3_hc_tx_getsockopt().

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 337 +++++++++++++++++++++++--------------------------
 net/dccp/ccids/ccid3.h | 119 +++++++++--------
 net/dccp/probe.c       |   6 +-
 3 files changed, 220 insertions(+), 242 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 3b8bd7ca676..2f026ce2714 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -67,13 +67,13 @@ static void ccid3_hc_tx_set_state(struct sock *sk,
 				  enum ccid3_hc_tx_states state)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+	enum ccid3_hc_tx_states oldstate = hctx->state;
 
 	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
 		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
 		       ccid3_tx_state_name(state));
 	WARN_ON(state == oldstate);
-	hctx->ccid3hctx_state = state;
+	hctx->state = state;
 }
 
 /*
@@ -88,10 +88,9 @@ static void ccid3_hc_tx_set_state(struct sock *sk,
 static inline u64 rfc3390_initial_rate(struct sock *sk)
 {
 	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	const __u32 w_init = clamp_t(__u32, 4380U,
-			2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
+	const __u32 w_init = clamp_t(__u32, 4380U, 2 * hctx->s, 4 * hctx->s);
 
-	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
+	return scaled_div(w_init << 6, hctx->rtt);
 }
 
 /*
@@ -100,24 +99,20 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
 {
 	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
-	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
-					     hctx->ccid3hctx_x);
+	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
 
 	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
-	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
-					   TFRC_OPSYS_HALF_TIME_GRAN);
-
-	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
-		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
-		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
+	hctx->delta = min_t(u32, hctx->t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN);
 
+	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hctx->t_ipi,
+		       hctx->delta, hctx->s, (unsigned)(hctx->x >> 6));
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
+	u32 delta = ktime_us_delta(now, hctx->t_last_win_count);
 
-	return delta / hctx->ccid3hctx_rtt;
+	return delta / hctx->rtt;
 }
 
 /**
@@ -133,8 +128,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
-	const  __u64 old_x = hctx->ccid3hctx_x;
+	u64 min_rate = 2 * hctx->x_recv;
+	const u64 old_x = hctx->x;
 	ktime_t now = stamp ? *stamp : ktime_get_real();
 
 	/*
@@ -145,33 +140,27 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	 */
 	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
 		min_rate = rfc3390_initial_rate(sk);
-		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
+		min_rate = max(min_rate, 2 * hctx->x_recv);
 	}
 
-	if (hctx->ccid3hctx_p > 0) {
+	if (hctx->p > 0) {
 
-		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
-					min_rate);
-		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
-					(((__u64)hctx->ccid3hctx_s) << 6) /
-								TFRC_T_MBI);
+		hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);
+		hctx->x = max(hctx->x, (((u64)hctx->s) << 6) / TFRC_T_MBI);
 
-	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
-				- (s64)hctx->ccid3hctx_rtt >= 0) {
+	} else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {
 
-		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
-		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
-			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
-				       hctx->ccid3hctx_rtt));
-		hctx->ccid3hctx_t_ld = now;
+		hctx->x = min(2 * hctx->x, min_rate);
+		hctx->x = max(hctx->x,
+			      scaled_div(((u64)hctx->s) << 6, hctx->rtt));
+		hctx->t_ld = now;
 	}
 
-	if (hctx->ccid3hctx_x != old_x) {
+	if (hctx->x != old_x) {
 		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
 			       "X_recv=%u\n", (unsigned)(old_x >> 6),
-			       (unsigned)(hctx->ccid3hctx_x >> 6),
-			       hctx->ccid3hctx_x_calc,
-			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
+			       (unsigned)(hctx->x >> 6), hctx->x_calc,
+			       (unsigned)(hctx->x_recv >> 6));
 
 		ccid3_update_send_interval(hctx);
 	}
@@ -183,11 +172,11 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
  */
 static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
 {
-	const u16 old_s = hctx->ccid3hctx_s;
+	const u16 old_s = hctx->s;
 
-	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
+	hctx->s = tfrc_ewma(hctx->s, len, 9);
 
-	if (hctx->ccid3hctx_s != old_s)
+	if (hctx->s != old_s)
 		ccid3_update_send_interval(hctx);
 }
 
@@ -198,13 +187,13 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
 						ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
-	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
+	u32 delta = ktime_us_delta(now, hctx->t_last_win_count),
+	    quarter_rtts = (4 * delta) / hctx->rtt;
 
 	if (quarter_rtts > 0) {
-		hctx->ccid3hctx_t_last_win_count = now;
-		hctx->ccid3hctx_last_win_count  += min(quarter_rtts, 5U);
-		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
+		hctx->t_last_win_count = now;
+		hctx->last_win_count  += min(quarter_rtts, 5U);
+		hctx->last_win_count  &= 0xF;		/* mod 16 */
 	}
 }
 
@@ -222,23 +211,21 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 	}
 
 	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
-		       ccid3_tx_state_name(hctx->ccid3hctx_state));
+		       ccid3_tx_state_name(hctx->state));
 
-	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
+	if (hctx->state == TFRC_SSTATE_FBACK)
 		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+	else if (hctx->state != TFRC_SSTATE_NO_FBACK)
 		goto out;
 
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+	 * RTO is 0 if and only if no feedback has been received yet.
 	 */
-	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
-	    hctx->ccid3hctx_p == 0) {
+	if (hctx->t_rto == 0 || hctx->p == 0) {
 
 		/* halve send rate directly */
-		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
-					(((__u64)hctx->ccid3hctx_s) << 6) /
-								    TFRC_T_MBI);
+		hctx->x = max(hctx->x / 2, (((u64)hctx->s) << 6) / TFRC_T_MBI);
 		ccid3_update_send_interval(hctx);
 	} else {
 		/*
@@ -251,33 +238,32 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		 *
 		 *  Note that X_recv is scaled by 2^6 while X_calc is not
 		 */
-		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
+		BUG_ON(hctx->p && !hctx->x_calc);
 
-		if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
-			hctx->ccid3hctx_x_recv =
-				max(hctx->ccid3hctx_x_recv / 2,
-				    (((__u64)hctx->ccid3hctx_s) << 6) /
-							      (2 * TFRC_T_MBI));
+		if (hctx->x_calc > (hctx->x_recv >> 5))
+			hctx->x_recv =
+				max(hctx->x_recv / 2,
+				    (((__u64)hctx->s) << 6) / (2 * TFRC_T_MBI));
 		else {
-			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
-			hctx->ccid3hctx_x_recv <<= 4;
+			hctx->x_recv = hctx->x_calc;
+			hctx->x_recv <<= 4;
 		}
 		ccid3_hc_tx_update_x(sk, NULL);
 	}
 	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
-			(unsigned long long)hctx->ccid3hctx_x);
+			(unsigned long long)hctx->x);
 
 	/*
 	 * Set new timeout for the nofeedback timer.
 	 * See comments in packet_recv() regarding the value of t_RTO.
 	 */
-	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
+	if (unlikely(hctx->t_rto == 0))		/* no feedback received yet */
 		t_nfb = TFRC_INITIAL_TIMEOUT;
 	else
-		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
+		t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
 
 restart_timer:
-	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+	sk_reset_timer(sk, &hctx->no_feedback_timer,
 			   jiffies + usecs_to_jiffies(t_nfb));
 out:
 	bh_unlock_sock(sk);
@@ -305,18 +291,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	switch (hctx->ccid3hctx_state) {
+	switch (hctx->state) {
 	case TFRC_SSTATE_NO_SENT:
-		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
-			       (jiffies +
+		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
 				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
-		hctx->ccid3hctx_last_win_count	 = 0;
-		hctx->ccid3hctx_t_last_win_count = now;
+		hctx->last_win_count   = 0;
+		hctx->t_last_win_count = now;
 
 		/* Set t_0 for initial packet */
-		hctx->ccid3hctx_t_nom = now;
+		hctx->t_nom = now;
 
-		hctx->ccid3hctx_s = skb->len;
+		hctx->s = skb->len;
 
 		/*
 		 * Use initial RTT sample when available: recommended by erratum
@@ -325,9 +310,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (dp->dccps_syn_rtt) {
 			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
-			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
-			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
-			hctx->ccid3hctx_t_ld = now;
+			hctx->rtt  = dp->dccps_syn_rtt;
+			hctx->x    = rfc3390_initial_rate(sk);
+			hctx->t_ld = now;
 		} else {
 			/*
 			 * Sender does not have RTT sample:
@@ -335,9 +320,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			 *   is needed in several parts (e.g.  window counter);
 			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
 			 */
-			hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
-			hctx->ccid3hctx_x   = hctx->ccid3hctx_s;
-			hctx->ccid3hctx_x <<= 6;
+			hctx->rtt = DCCP_FALLBACK_RTT;
+			hctx->x   = hctx->s;
+			hctx->x <<= 6;
 		}
 		ccid3_update_send_interval(hctx);
 
@@ -345,7 +330,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		break;
 	case TFRC_SSTATE_NO_FBACK:
 	case TFRC_SSTATE_FBACK:
-		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
+		delay = ktime_us_delta(hctx->t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
 		/*
 		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
@@ -355,7 +340,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 * else
 		 *       // send the packet in (t_nom - t_now) milliseconds.
 		 */
-		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
+		if (delay - (s64)hctx->delta >= 1000)
 			return (u32)delay / 1000L;
 
 		ccid3_hc_tx_update_win_count(hctx, now);
@@ -367,11 +352,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 
 	/* prepare to send now (add options etc.) */
 	dp->dccps_hc_tx_insert_options = 1;
-	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
+	DCCP_SKB_CB(skb)->dccpd_ccval  = hctx->last_win_count;
 
 	/* set the nominal send time for the next following packet */
-	hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
-					     hctx->ccid3hctx_t_ipi);
+	hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi);
 	return 0;
 }
 
@@ -382,14 +366,14 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
 
 	ccid3_hc_tx_update_s(hctx, len);
 
-	if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
+	if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss))
 		DCCP_CRIT("packet history - out of memory!");
 }
 
 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	struct ccid3_options_received *opt_recv;
+	struct ccid3_options_received *opt_recv = &hctx->options_received;
 	ktime_t now;
 	unsigned long t_nfb;
 	u32 pinv, r_sample;
@@ -399,15 +383,14 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
 		return;
 	/* ... and only in the established state */
-	if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
-	    hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+	if (hctx->state != TFRC_SSTATE_FBACK &&
+	    hctx->state != TFRC_SSTATE_NO_FBACK)
 		return;
 
-	opt_recv = &hctx->ccid3hctx_options_received;
 	now = ktime_get_real();
 
 	/* Estimate RTT from history if ACK number is valid */
-	r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
+	r_sample = tfrc_tx_hist_rtt(hctx->hist,
 				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
 	if (r_sample == 0) {
 		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
@@ -417,37 +400,37 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* Update receive rate in units of 64 * bytes/second */
-	hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
-	hctx->ccid3hctx_x_recv <<= 6;
+	hctx->x_recv = opt_recv->ccid3or_receive_rate;
+	hctx->x_recv <<= 6;
 
 	/* Update loss event rate (which is scaled by 1e6) */
 	pinv = opt_recv->ccid3or_loss_event_rate;
 	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
-		hctx->ccid3hctx_p = 0;
+		hctx->p = 0;
 	else				       /* can not exceed 100% */
-		hctx->ccid3hctx_p = scaled_div(1, pinv);
+		hctx->p = scaled_div(1, pinv);
 	/*
 	 * Validate new RTT sample and update moving average
 	 */
 	r_sample = dccp_sample_rtt(sk, r_sample);
-	hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
+	hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
 	 */
-	if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+	if (hctx->state == TFRC_SSTATE_NO_FBACK) {
 		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
 
-		if (hctx->ccid3hctx_t_rto == 0) {
+		if (hctx->t_rto == 0) {
 			/*
 			 * Initial feedback packet: Larger Initial Windows (4.2)
 			 */
-			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
-			hctx->ccid3hctx_t_ld = now;
+			hctx->x    = rfc3390_initial_rate(sk);
+			hctx->t_ld = now;
 
 			ccid3_update_send_interval(hctx);
 
 			goto done_computing_x;
-		} else if (hctx->ccid3hctx_p == 0) {
+		} else if (hctx->p == 0) {
 			/*
 			 * First feedback after nofeedback timer expiry (4.3)
 			 */
@@ -456,25 +439,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
-	if (hctx->ccid3hctx_p > 0)
-		hctx->ccid3hctx_x_calc =
-				tfrc_calc_x(hctx->ccid3hctx_s,
-					    hctx->ccid3hctx_rtt,
-					    hctx->ccid3hctx_p);
+	if (hctx->p > 0)
+		hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p);
 	ccid3_hc_tx_update_x(sk, &now);
 
 done_computing_x:
 	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
 			       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
-			       dccp_role(sk),
-			       sk, hctx->ccid3hctx_rtt, r_sample,
-			       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
-			       hctx->ccid3hctx_x_calc,
-			       (unsigned)(hctx->ccid3hctx_x_recv >> 6),
-			       (unsigned)(hctx->ccid3hctx_x >> 6));
+			       dccp_role(sk), sk, hctx->rtt, r_sample,
+			       hctx->s, hctx->p, hctx->x_calc,
+			       (unsigned)(hctx->x_recv >> 6),
+			       (unsigned)(hctx->x >> 6));
 
 	/* unschedule no feedback timer */
-	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+	sk_stop_timer(sk, &hctx->no_feedback_timer);
 
 	/*
 	 * As we have calculated new ipi, delta, t_nom it is possible
@@ -488,21 +466,19 @@ done_computing_x:
 	 * This can help avoid triggering the nofeedback timer too
 	 * often ('spinning') on LANs with small RTTs.
 	 */
-	hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
-					   (CONFIG_IP_DCCP_CCID3_RTO *
-					    (USEC_PER_SEC / 1000)));
+	hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO *
+						 (USEC_PER_SEC / 1000)));
 	/*
 	 * Schedule no feedback timer to expire in
 	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
 	 */
-	t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
+	t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
 
 	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
 		       "expire in %lu jiffies (%luus)\n",
-		       dccp_role(sk),
-		       sk, usecs_to_jiffies(t_nfb), t_nfb);
+		       dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
 
-	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+	sk_reset_timer(sk, &hctx->no_feedback_timer,
 			   jiffies + usecs_to_jiffies(t_nfb));
 }
 
@@ -513,11 +489,9 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
 	int rc = 0;
 	const struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	struct ccid3_options_received *opt_recv;
+	struct ccid3_options_received *opt_recv = &hctx->options_received;
 	__be32 opt_val;
 
-	opt_recv = &hctx->ccid3hctx_options_received;
-
 	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
 		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
 		opt_recv->ccid3or_loss_event_rate    = ~0;
@@ -572,11 +546,10 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
 
-	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
-	hctx->ccid3hctx_hist = NULL;
-	setup_timer(&hctx->ccid3hctx_no_feedback_timer,
-			ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
-
+	hctx->state = TFRC_SSTATE_NO_SENT;
+	hctx->hist  = NULL;
+	setup_timer(&hctx->no_feedback_timer,
+		    ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
 	return 0;
 }
 
@@ -585,9 +558,9 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
 	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
-	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+	sk_stop_timer(sk, &hctx->no_feedback_timer);
 
-	tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
+	tfrc_tx_hist_purge(&hctx->hist);
 }
 
 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
@@ -599,14 +572,15 @@ static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
 		return;
 
 	hctx = ccid3_hc_tx_sk(sk);
-	info->tcpi_rto = hctx->ccid3hctx_t_rto;
-	info->tcpi_rtt = hctx->ccid3hctx_rtt;
+	info->tcpi_rto = hctx->t_rto;
+	info->tcpi_rtt = hctx->rtt;
 }
 
 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
 	const struct ccid3_hc_tx_sock *hctx;
+	struct tfrc_tx_info tfrc;
 	const void *val;
 
 	/* Listen socks doesn't have a private CCID block */
@@ -616,10 +590,17 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 	hctx = ccid3_hc_tx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_TX_INFO:
-		if (len < sizeof(hctx->ccid3hctx_tfrc))
+		if (len < sizeof(tfrc))
 			return -EINVAL;
-		len = sizeof(hctx->ccid3hctx_tfrc);
-		val = &hctx->ccid3hctx_tfrc;
+		tfrc.tfrctx_x	   = hctx->x;
+		tfrc.tfrctx_x_recv = hctx->x_recv;
+		tfrc.tfrctx_x_calc = hctx->x_calc;
+		tfrc.tfrctx_rtt	   = hctx->rtt;
+		tfrc.tfrctx_p	   = hctx->p;
+		tfrc.tfrctx_rto	   = hctx->t_rto;
+		tfrc.tfrctx_ipi	   = hctx->t_ipi;
+		len = sizeof(tfrc);
+		val = &tfrc;
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -660,13 +641,13 @@ static void ccid3_hc_rx_set_state(struct sock *sk,
 				  enum ccid3_hc_rx_states state)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
+	enum ccid3_hc_rx_states oldstate = hcrx->state;
 
 	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
 		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
 		       ccid3_rx_state_name(state));
 	WARN_ON(state == oldstate);
-	hcrx->ccid3hcrx_state = state;
+	hcrx->state = state;
 }
 
 static void ccid3_hc_rx_send_feedback(struct sock *sk,
@@ -678,15 +659,15 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 	ktime_t now;
 	s64 delta = 0;
 
-	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
+	if (unlikely(hcrx->state == TFRC_RSTATE_TERM))
 		return;
 
 	now = ktime_get_real();
 
 	switch (fbtype) {
 	case CCID3_FBACK_INITIAL:
-		hcrx->ccid3hcrx_x_recv = 0;
-		hcrx->ccid3hcrx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
+		hcrx->x_recv = 0;
+		hcrx->p_inverse = ~0U;   /* see RFC 4342, 8.5 */
 		break;
 	case CCID3_FBACK_PARAM_CHANGE:
 		/*
@@ -699,27 +680,26 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 		 * the number of bytes since last feedback.
 		 * This is a safe fallback, since X is bounded above by X_calc.
 		 */
-		if (hcrx->ccid3hcrx_x_recv > 0)
+		if (hcrx->x_recv > 0)
 			break;
 		/* fall through */
 	case CCID3_FBACK_PERIODIC:
-		delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
+		delta = ktime_us_delta(now, hcrx->tstamp_last_feedback);
 		if (delta <= 0)
 			DCCP_BUG("delta (%ld) <= 0", (long)delta);
 		else
-			hcrx->ccid3hcrx_x_recv =
-				scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
+			hcrx->x_recv = scaled_div32(hcrx->bytes_recv, delta);
 		break;
 	default:
 		return;
 	}
 
-	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
-		       hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
+	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n",
+		       (long)delta, hcrx->x_recv, hcrx->p_inverse);
 
-	hcrx->ccid3hcrx_tstamp_last_feedback = now;
-	hcrx->ccid3hcrx_last_counter	     = dccp_hdr(skb)->dccph_ccval;
-	hcrx->ccid3hcrx_bytes_recv	     = 0;
+	hcrx->tstamp_last_feedback = now;
+	hcrx->last_counter	   = dccp_hdr(skb)->dccph_ccval;
+	hcrx->bytes_recv	   = 0;
 
 	dp->dccps_hc_rx_insert_options = 1;
 	dccp_send_ack(sk);
@@ -738,8 +718,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 	if (dccp_packet_without_ack(skb))
 		return 0;
 
-	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
-	pinv   = htonl(hcrx->ccid3hcrx_pinv);
+	x_recv = htonl(hcrx->x_recv);
+	pinv   = htonl(hcrx->p_inverse);
 
 	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
 			       &pinv, sizeof(pinv)) ||
@@ -765,22 +745,23 @@ static u32 ccid3_first_li(struct sock *sk)
 	u32 x_recv, p, delta;
 	u64 fval;
 
-	if (hcrx->ccid3hcrx_rtt == 0) {
+	if (hcrx->rtt == 0) {
 		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
-		hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
+		hcrx->rtt = DCCP_FALLBACK_RTT;
 	}
 
-	delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
-	x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
+	delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback));
+	x_recv = scaled_div32(hcrx->bytes_recv, delta);
 	if (x_recv == 0) {		/* would also trigger divide-by-zero */
 		DCCP_WARN("X_recv==0\n");
-		if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
+		if (hcrx->x_recv == 0) {
 			DCCP_BUG("stored value of X_recv is zero");
 			return ~0U;
 		}
+		x_recv = hcrx->x_recv;
 	}
 
-	fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
+	fval = scaled_div(hcrx->s, hcrx->rtt);
 	fval = scaled_div32(fval, x_recv);
 	p = tfrc_calc_x_reverse_lookup(fval);
 
@@ -797,14 +778,14 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
 	const bool is_data_packet = dccp_data_packet(skb);
 
-	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
+	if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) {
 		if (is_data_packet) {
 			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
 			do_feedback = CCID3_FBACK_INITIAL;
 			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
-			hcrx->ccid3hcrx_s = payload;
+			hcrx->s = payload;
 			/*
-			 * Not necessary to update ccid3hcrx_bytes_recv here,
+			 * Not necessary to update bytes_recv here,
 			 * since X_recv = 0 for the first feedback packet (cf.
 			 * RFC 3448, 6.3) -- gerrit
 			 */
@@ -812,7 +793,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		goto update_records;
 	}
 
-	if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
+	if (tfrc_rx_hist_duplicate(&hcrx->hist, skb))
 		return; /* done receiving */
 
 	if (is_data_packet) {
@@ -820,20 +801,20 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		/*
 		 * Update moving-average of s and the sum of received payload bytes
 		 */
-		hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
-		hcrx->ccid3hcrx_bytes_recv += payload;
+		hcrx->s = tfrc_ewma(hcrx->s, payload, 9);
+		hcrx->bytes_recv += payload;
 	}
 
 	/*
 	 * Perform loss detection and handle pending losses
 	 */
-	if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist,
+	if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist,
 				skb, ndp, ccid3_first_li, sk)) {
 		do_feedback = CCID3_FBACK_PARAM_CHANGE;
 		goto done_receiving;
 	}
 
-	if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
+	if (tfrc_rx_hist_loss_pending(&hcrx->hist))
 		return; /* done receiving */
 
 	/*
@@ -842,17 +823,17 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(!is_data_packet))
 		goto update_records;
 
-	if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
-		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
+	if (!tfrc_lh_is_initialised(&hcrx->li_hist)) {
+		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->hist, skb);
 		/*
 		 * Empty loss history: no loss so far, hence p stays 0.
 		 * Sample RTT values, since an RTT estimate is required for the
 		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
 		 */
 		if (sample != 0)
-			hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
+			hcrx->rtt = tfrc_ewma(hcrx->rtt, sample, 9);
 
-	} else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
+	} else if (tfrc_lh_update_i_mean(&hcrx->li_hist, skb)) {
 		/*
 		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
 		 * has decreased (resp. p has increased), send feedback now.
@@ -863,11 +844,11 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
 	 */
-	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
+	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
 		do_feedback = CCID3_FBACK_PERIODIC;
 
 update_records:
-	tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
+	tfrc_rx_hist_add_packet(&hcrx->hist, skb, ndp);
 
 done_receiving:
 	if (do_feedback)
@@ -878,9 +859,9 @@ static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
 
-	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
-	tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
-	return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
+	hcrx->state = TFRC_RSTATE_NO_DATA;
+	tfrc_lh_init(&hcrx->li_hist);
+	return tfrc_rx_hist_alloc(&hcrx->hist);
 }
 
 static void ccid3_hc_rx_exit(struct sock *sk)
@@ -889,8 +870,8 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 
 	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
 
-	tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
-	tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
+	tfrc_rx_hist_purge(&hcrx->hist);
+	tfrc_lh_cleanup(&hcrx->li_hist);
 }
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
@@ -902,9 +883,9 @@ static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 		return;
 
 	hcrx = ccid3_hc_rx_sk(sk);
-	info->tcpi_ca_state = hcrx->ccid3hcrx_state;
+	info->tcpi_ca_state = hcrx->state;
 	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
-	info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
+	info->tcpi_rcv_rtt  = hcrx->rtt;
 }
 
 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
@@ -923,10 +904,10 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
 	case DCCP_SOCKOPT_CCID_RX_INFO:
 		if (len < sizeof(rx_info))
 			return -EINVAL;
-		rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
-		rx_info.tfrcrx_rtt    = hcrx->ccid3hcrx_rtt;
-		rx_info.tfrcrx_p      = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
-					   scaled_div(1, hcrx->ccid3hcrx_pinv);
+		rx_info.tfrcrx_x_recv = hcrx->x_recv;
+		rx_info.tfrcrx_rtt    = hcrx->rtt;
+		rx_info.tfrcrx_p      = hcrx->p_inverse == 0 ? ~0U :
+					   scaled_div(1, hcrx->p_inverse);
 		len = sizeof(rx_info);
 		val = &rx_info;
 		break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 49ca32bd7e7..0cfcfff8f5f 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -77,44 +77,43 @@ enum ccid3_hc_tx_states {
 
 /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
  *
- * @ccid3hctx_x - Current sending rate in 64 * bytes per second
- * @ccid3hctx_x_recv - Receive rate    in 64 * bytes per second
- * @ccid3hctx_x_calc - Calculated rate in bytes per second
- * @ccid3hctx_rtt - Estimate of current round trip time in usecs
- * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
- * @ccid3hctx_s - Packet size in bytes
- * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs
- * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
- * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
- * @ccid3hctx_last_win_count - Last window counter sent
- * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
- *				 with last_win_count value sent
- * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
- * @ccid3hctx_t_ld - Time last doubled during slow start
- * @ccid3hctx_t_nom - Nominal send time of next packet
- * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
- * @ccid3hctx_hist - Packet history
- * @ccid3hctx_options_received - Parsed set of retrieved options
+ * @x - Current sending rate in 64 * bytes per second
+ * @x_recv - Receive rate    in 64 * bytes per second
+ * @x_calc - Calculated rate in bytes per second
+ * @rtt - Estimate of current round trip time in usecs
+ * @p - Current loss event rate (0-1) scaled by 1000000
+ * @s - Packet size in bytes
+ * @t_rto - Nofeedback Timer setting in usecs
+ * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
+ * @state - Sender state, one of %ccid3_hc_tx_states
+ * @last_win_count - Last window counter sent
+ * @t_last_win_count - Timestamp of earliest packet with
+ *                     last_win_count value sent
+ * @no_feedback_timer - Handle to no feedback timer
+ * @t_ld - Time last doubled during slow start
+ * @t_nom - Nominal send time of next packet
+ * @delta - Send timer delta (RFC 3448, 4.6) in usecs
+ * @hist - Packet history
+ * @options_received - Parsed set of retrieved options
  */
 struct ccid3_hc_tx_sock {
-	struct tfrc_tx_info		ccid3hctx_tfrc;
-#define ccid3hctx_x			ccid3hctx_tfrc.tfrctx_x
-#define ccid3hctx_x_recv		ccid3hctx_tfrc.tfrctx_x_recv
-#define ccid3hctx_x_calc		ccid3hctx_tfrc.tfrctx_x_calc
-#define ccid3hctx_rtt			ccid3hctx_tfrc.tfrctx_rtt
-#define ccid3hctx_p			ccid3hctx_tfrc.tfrctx_p
-#define ccid3hctx_t_rto			ccid3hctx_tfrc.tfrctx_rto
-#define ccid3hctx_t_ipi			ccid3hctx_tfrc.tfrctx_ipi
-	u16				ccid3hctx_s;
-	enum ccid3_hc_tx_states		ccid3hctx_state:8;
-	u8				ccid3hctx_last_win_count;
-	ktime_t				ccid3hctx_t_last_win_count;
-	struct timer_list		ccid3hctx_no_feedback_timer;
-	ktime_t				ccid3hctx_t_ld;
-	ktime_t				ccid3hctx_t_nom;
-	u32				ccid3hctx_delta;
-	struct tfrc_tx_hist_entry	*ccid3hctx_hist;
-	struct ccid3_options_received	ccid3hctx_options_received;
+	u64				x;
+	u64				x_recv;
+	u32				x_calc;
+	u32				rtt;
+	u32				p;
+	u32				t_rto;
+	u32				t_ipi;
+	u16				s;
+	enum ccid3_hc_tx_states		state:8;
+	u8				last_win_count;
+	ktime_t				t_last_win_count;
+	struct timer_list		no_feedback_timer;
+	ktime_t				t_ld;
+	ktime_t				t_nom;
+	u32				delta;
+	struct tfrc_tx_hist_entry	*hist;
+	struct ccid3_options_received	options_received;
 };
 
 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -133,32 +132,32 @@ enum ccid3_hc_rx_states {
 
 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
  *
- *  @ccid3hcrx_x_recv  -  Receiver estimate of send rate (RFC 3448 4.3)
- *  @ccid3hcrx_rtt  -  Receiver estimate of rtt (non-standard)
- *  @ccid3hcrx_p  -  Current loss event rate (RFC 3448 5.4)
- *  @ccid3hcrx_last_counter  -  Tracks window counter (RFC 4342, 8.1)
- *  @ccid3hcrx_state  -  Receiver state, one of %ccid3_hc_rx_states
- *  @ccid3hcrx_bytes_recv  -  Total sum of DCCP payload bytes
- *  @ccid3hcrx_x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
- *  @ccid3hcrx_rtt  -  Receiver estimate of RTT
- *  @ccid3hcrx_tstamp_last_feedback  -  Time at which last feedback was sent
- *  @ccid3hcrx_tstamp_last_ack  -  Time at which last feedback was sent
- *  @ccid3hcrx_hist  -  Packet history (loss detection + RTT sampling)
- *  @ccid3hcrx_li_hist  -  Loss Interval database
- *  @ccid3hcrx_s  -  Received packet size in bytes
- *  @ccid3hcrx_pinv  -  Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
+ *  @x_recv  -  Receiver estimate of send rate (RFC 3448 4.3)
+ *  @rtt  -  Receiver estimate of rtt (non-standard)
+ *  @p  -  Current loss event rate (RFC 3448 5.4)
+ *  @last_counter  -  Tracks window counter (RFC 4342, 8.1)
+ *  @state  -  Receiver state, one of %ccid3_hc_rx_states
+ *  @bytes_recv  -  Total sum of DCCP payload bytes
+ *  @x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
+ *  @rtt  -  Receiver estimate of RTT
+ *  @tstamp_last_feedback  -  Time at which last feedback was sent
+ *  @tstamp_last_ack  -  Time at which last feedback was sent
+ *  @hist  -  Packet history (loss detection + RTT sampling)
+ *  @li_hist  -  Loss Interval database
+ *  @s  -  Received packet size in bytes
+ *  @p_inverse  -  Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
  */
 struct ccid3_hc_rx_sock {
-	u8				ccid3hcrx_last_counter:4;
-	enum ccid3_hc_rx_states		ccid3hcrx_state:8;
-	u32				ccid3hcrx_bytes_recv;
-	u32				ccid3hcrx_x_recv;
-	u32				ccid3hcrx_rtt;
-	ktime_t				ccid3hcrx_tstamp_last_feedback;
-	struct tfrc_rx_hist		ccid3hcrx_hist;
-	struct tfrc_loss_hist		ccid3hcrx_li_hist;
-	u16				ccid3hcrx_s;
-#define ccid3hcrx_pinv			ccid3hcrx_li_hist.i_mean
+	u8				last_counter:4;
+	enum ccid3_hc_rx_states		state:8;
+	u32				bytes_recv;
+	u32				x_recv;
+	u32				rtt;
+	ktime_t				tstamp_last_feedback;
+	struct tfrc_rx_hist		hist;
+	struct tfrc_loss_hist		li_hist;
+	u16				s;
+#define p_inverse			li_hist.i_mean
 };
 
 static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 9ca783d7467..a87fd4fc1b3 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -87,10 +87,8 @@ static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
 			       "%llu %llu %d\n",
 			       NIPQUAD(inet->saddr), ntohs(inet->sport),
 			       NIPQUAD(inet->daddr), ntohs(inet->dport), size,
-			       hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
-			       hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc,
-			       hctx->ccid3hctx_x_recv >> 6,
-			       hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi);
+			       hctx->s, hctx->rtt, hctx->p, hctx->x_calc,
+			       hctx->x_recv >> 6, hctx->x >> 6, hctx->t_ipi);
 		else
 			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
 			       NIPQUAD(inet->saddr), ntohs(inet->sport),
-- 
cgit v1.2.3


From b2e317f4b5ae73733963c702fae0f246d234100b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: No more CCID control blocks in LISTEN state

The CCIDs are activated as last of the features, at the end of the handshake,
were the LISTEN state of the master socket is inherited into the server
state of the child socket. Thus, the only states visible to CCIDs now are
OPEN/PARTOPEN, and the closing states.

This allows to remove tests which were previously necessary to protect
against referencing a socket in the listening state (in CCID3), but which
now have become redundant.

As a further byproduct of enabling the CCIDs only after the connection has been
fully established, several typecast-initialisations of ccid3_hc_{rx,tx}_sock
can now be eliminated:
 * the CCID is loaded, so it is not necessary to test if it is NULL,
 * if it is possible to load a CCID and leave the private area NULL, then this
    is a bug, which should crash loudly - and earlier,
 * the test for state==OPEN || state==PARTOPEN now reduces only to the closing
   phase (e.g. when the node has received an unexpected Reset).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/ccids/ccid3.c | 40 +++++++---------------------------------
 1 file changed, 7 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 2f026ce2714..b4cc62e2e2b 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -565,29 +565,17 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 
 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
 {
-	struct ccid3_hc_tx_sock *hctx;
-
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return;
-
-	hctx = ccid3_hc_tx_sk(sk);
-	info->tcpi_rto = hctx->t_rto;
-	info->tcpi_rtt = hctx->rtt;
+	info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto;
+	info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt;
 }
 
 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
-	const struct ccid3_hc_tx_sock *hctx;
+	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 	struct tfrc_tx_info tfrc;
 	const void *val;
 
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return -EINVAL;
-
-	hctx = ccid3_hc_tx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_TX_INFO:
 		if (len < sizeof(tfrc))
@@ -707,14 +695,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 
 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
-	const struct ccid3_hc_rx_sock *hcrx;
+	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 	__be32 x_recv, pinv;
 
 	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
 		return 0;
 
-	hcrx = ccid3_hc_rx_sk(sk);
-
 	if (dccp_packet_without_ack(skb))
 		return 0;
 
@@ -876,30 +862,18 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
-	const struct ccid3_hc_rx_sock *hcrx;
-
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return;
-
-	hcrx = ccid3_hc_rx_sk(sk);
-	info->tcpi_ca_state = hcrx->state;
+	info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->state;
 	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
-	info->tcpi_rcv_rtt  = hcrx->rtt;
+	info->tcpi_rcv_rtt  = ccid3_hc_rx_sk(sk)->rtt;
 }
 
 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
-	const struct ccid3_hc_rx_sock *hcrx;
+	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 	struct tfrc_rx_info rx_info;
 	const void *val;
 
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return -EINVAL;
-
-	hcrx = ccid3_hc_rx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_RX_INFO:
 		if (len < sizeof(rx_info))
-- 
cgit v1.2.3


From de6f2b59e5cd15a8772adb732a1d80e141a77115 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Bug fix for the inter-packet scheduling algorithm

This fixes a subtle bug in the calculation of the inter-packet gap and shows
that t_delta, as it is currently used, is not needed. And hence replaced.

The algorithm from RFC 3448, 4.6 below continually computes a send time t_nom,
which is initialised with the current time t_now; t_gran = 1E6 / HZ specifies
the scheduling granularity, s the packet size, and X the sending rate:

  t_distance = t_nom - t_now;		// in microseconds
  t_delta    = min(t_ipi, t_gran) / 2;	// `delta' parameter in microseconds

  if (t_distance >= t_delta) {
	reschedule after (t_distance / 1000) milliseconds;
  } else {
  	t_ipi  = s / X;			// inter-packet interval in usec
	t_nom += t_ipi;			// compute the next send time
	send packet now;
  }


1) Description of the bug
-------------------------
Rescheduling requires a conversion into milliseconds, due to this call chain:

 * ccid3_hc_tx_send_packet() returns a timeout in milliseconds,
 * this value is converted by msecs_to_jiffies() in dccp_write_xmit(),
 * and finally used as jiffy-expires-value for sk_reset_timer().

The highest jiffy resolution with HZ=1000 is 1 millisecond, so using a higher
granularity does not make much sense here.

As a consequence, values of t_distance < 1000 are truncated to 0. This issue
has so far been resolved by using instead

  if (t_distance >= t_delta + 1000)
	reschedule after (t_distance / 1000) milliseconds;

The bug is in artificially inflating t_delta to t_delta' = t_delta + 1000. This
is unnecessarily large, a more adequate value is t_delta' = max(t_delta, 1000).


2) Consequences of using the corrected t_delta'
-----------------------------------------------
Since t_delta <= t_gran/2 = 10^6/(2*HZ), we have t_delta <= 1000 as long as
HZ >= 500. This means that t_delta' = max(1000, t_delta) is constant at 1000.

On the other hand, when using a coarse HZ value of HZ < 500, we have three
sub-cases that can all be reduced to using another constant of t_gran/2.

 (a) The first case arises when t_ipi > t_gran. Here t_delta' is the constant
     t_delta' = max(1000, t_gran/2) = t_gran/2.

 (b) If t_ipi <= 2000 < t_gran = 10^6/HZ usec, then t_delta = t_ipi/2 <= 1000,
     so that t_delta' = max(1000, t_delta) = 1000 < t_gran/2.

 (c) If 2000 < t_ipi <= t_gran, we have t_delta' = max(t_delta, 1000) = t_ipi/2.

In the second and third cases we have delay values less than t_gran/2, which is
in the order of less than or equal to half a jiffy.

How these are treated depends on how fractions of a jiffy are handled: they
are either always rounded down to 0, or always rounded up to 1 jiffy (assuming
non-zero values). In both cases the error is on average in the order of 50%.

Thus we are not increasing the error when in the second/third case we replace
a value less than t_gran/2 with 0, by setting t_delta' to the constant t_gran/2.


3) Summary
----------
Fixing (1) and considering (2), the patch replaces t_delta with a constant,
whose value depends on CONFIG_HZ, changing the above algorithm to:

  if (t_distance >= t_delta')
	reschedule after (t_distance / 1000) milliseconds;

where t_delta' = 10^6/(2*HZ) if HZ < 500, and t_delta' = 1000 otherwise.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 17 +++++++----------
 net/dccp/ccids/ccid3.h | 19 ++++++++++++++-----
 2 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index b4cc62e2e2b..eb1bda08eeb 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -93,19 +93,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
 	return scaled_div(w_init << 6, hctx->rtt);
 }
 
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval  -  Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
 {
-	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
 	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
 
-	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
-	hctx->delta = min_t(u32, hctx->t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN);
-
-	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hctx->t_ipi,
-		       hctx->delta, hctx->s, (unsigned)(hctx->x >> 6));
+	ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hctx->t_ipi,
+		       hctx->s, (unsigned)(hctx->x >> 6));
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
@@ -340,8 +337,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 * else
 		 *       // send the packet in (t_nom - t_now) milliseconds.
 		 */
-		if (delay - (s64)hctx->delta >= 1000)
-			return (u32)delay / 1000L;
+		if (delay >= TFRC_T_DELTA)
+			return (u32)delay / USEC_PER_MSEC;
 
 		ccid3_hc_tx_update_win_count(hctx, now);
 		break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 0cfcfff8f5f..92a5e1688a0 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,12 +47,23 @@
 /* Two seconds as per RFC 3448 4.2 */
 #define TFRC_INITIAL_TIMEOUT	   (2 * USEC_PER_SEC)
 
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN  (USEC_PER_SEC / (2 * HZ))
-
 /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
 #define TFRC_T_MBI		   64
 
+/*
+ * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta  =  t_gran / 2  =  %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA		   USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA		   (USEC_PER_SEC / (2 * HZ))
+#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
+#endif
+
 enum ccid3_options {
 	TFRC_OPT_LOSS_EVENT_RATE = 192,
 	TFRC_OPT_LOSS_INTERVALS	 = 193,
@@ -92,7 +103,6 @@ enum ccid3_hc_tx_states {
  * @no_feedback_timer - Handle to no feedback timer
  * @t_ld - Time last doubled during slow start
  * @t_nom - Nominal send time of next packet
- * @delta - Send timer delta (RFC 3448, 4.6) in usecs
  * @hist - Packet history
  * @options_received - Parsed set of retrieved options
  */
@@ -111,7 +121,6 @@ struct ccid3_hc_tx_sock {
 	struct timer_list		no_feedback_timer;
 	ktime_t				t_ld;
 	ktime_t				t_nom;
-	u32				delta;
 	struct tfrc_tx_hist_entry	*hist;
 	struct ccid3_options_received	options_received;
 };
-- 
cgit v1.2.3


From 63b3a73bb85daf441f964aaf9b3fc89be4209c23 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Remove ugly RTT-sampling history lookup

This removes the RTT-sampling function tfrc_tx_hist_rtt(), since

 1. it suffered from complex passing of return values (the return value both
    indicated successful lookup while the value doubled as RTT sample);

 2. when for some odd reason the sample value equalled 0, this triggered a bug
    warning about "bogus Ack", due to the ambiguity of the return value;

 3. on a passive host which has not sent anything the TX history is empty and
    thus will lead to unwanted "bogus Ack" warnings such as
    ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-28197148
    ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-26641606.

The fix is to replace the implicit encoding by performing the steps manually.

Furthermore, the "bogus Ack" warning has been removed, since it can actually be
triggered due to several reasons (network reordering, old packet, (3) above),
hence it is not very useful.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              | 34 +++++++++++++++++--------------
 net/dccp/ccids/lib/packet_history.c | 40 -------------------------------------
 net/dccp/ccids/lib/packet_history.h | 22 +++++++++++++++++---
 3 files changed, 38 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index eb1bda08eeb..f74e58d9793 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -371,6 +371,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 	struct ccid3_options_received *opt_recv = &hctx->options_received;
+	struct tfrc_tx_hist_entry *acked;
 	ktime_t now;
 	unsigned long t_nfb;
 	u32 pinv, r_sample;
@@ -384,17 +385,24 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	    hctx->state != TFRC_SSTATE_NO_FBACK)
 		return;
 
-	now = ktime_get_real();
-
-	/* Estimate RTT from history if ACK number is valid */
-	r_sample = tfrc_tx_hist_rtt(hctx->hist,
-				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
-	if (r_sample == 0) {
-		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
-			  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
-			  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	/*
+	 * Locate the acknowledged packet in the TX history.
+	 *
+	 * Returning "entry not found" here can for instance happen when
+	 *  - the host has not sent out anything (e.g. a passive server),
+	 *  - the Ack is outdated (packet with higher Ack number was received),
+	 *  - it is a bogus Ack (for a packet not sent on this connection).
+	 */
+	acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb));
+	if (acked == NULL)
 		return;
-	}
+	/* For the sake of RTT sampling, ignore/remove all older entries */
+	tfrc_tx_hist_purge(&acked->next);
+
+	/* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+	now	  = ktime_get_real();
+	r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
+	hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
 
 	/* Update receive rate in units of 64 * bytes/second */
 	hctx->x_recv = opt_recv->ccid3or_receive_rate;
@@ -406,11 +414,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		hctx->p = 0;
 	else				       /* can not exceed 100% */
 		hctx->p = scaled_div(1, pinv);
-	/*
-	 * Validate new RTT sample and update moving average
-	 */
-	r_sample = dccp_sample_rtt(sk, r_sample);
-	hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
+
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
 	 */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 6cc108afdc3..5c445086690 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -40,18 +40,6 @@
 #include "packet_history.h"
 #include "../../dccp.h"
 
-/**
- *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
- *  @next:  next oldest entry (LIFO order)
- *  @seqno: sequence number of this entry
- *  @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
-	struct tfrc_tx_hist_entry *next;
-	u64			  seqno;
-	ktime_t			  stamp;
-};
-
 /*
  * Transmitter History Routines
  */
@@ -73,15 +61,6 @@ void tfrc_tx_packet_history_exit(void)
 	}
 }
 
-static struct tfrc_tx_hist_entry *
-	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
-	while (head != NULL && head->seqno != seqno)
-		head = head->next;
-
-	return head;
-}
-
 int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
 {
 	struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -111,25 +90,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
 }
 EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
 
-u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
-		     const ktime_t now)
-{
-	u32 rtt = 0;
-	struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
-
-	if (packet != NULL) {
-		rtt = ktime_us_delta(now, packet->stamp);
-		/*
-		 * Garbage-collect older (irrelevant) entries:
-		 */
-		tfrc_tx_hist_purge(&packet->next);
-	}
-
-	return rtt;
-}
-EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
-
-
 /*
  * 	Receiver History Routines
  */
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 461cc91cce8..221d8102da5 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
 #include <linux/slab.h>
 #include "tfrc.h"
 
-struct tfrc_tx_hist_entry;
+/**
+ *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
+ *  @next:  next oldest entry (LIFO order)
+ *  @seqno: sequence number of this entry
+ *  @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+	struct tfrc_tx_hist_entry *next;
+	u64			  seqno;
+	ktime_t			  stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+	while (head != NULL && head->seqno != seqno)
+		head = head->next;
+	return head;
+}
 
 extern int  tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
 extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-extern u32  tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
-			     const u64 seqno, const ktime_t now);
 
 /* Subtraction a-b modulo-16, respects circular wrap-around */
 #define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
-- 
cgit v1.2.3


From 47a61e7b433a014296971ea1226eb1adb6310ab4 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Simplify and consolidate tx_parse_options

This simplifies and consolidates the TX option-parsing code:

 1. The Loss Intervals option is not currently used, so dead code related to
    this option is removed. I am aware of no plans to support the option, but
    if someone wants to implement it (e.g. for inter-op tests), it is better
    to start afresh than having to also update currently unused code.

 2. The Loss Event and Receive Rate options have a lot of code in common (both
    are 32 bit, both have same length etc.), so this is consolidated.

 3. The test against GSR is not necessary, because
    - on first loading CCID3, ccid_new() zeroes out all fields in the socket;
    - ccid3_hc_tx_packet_recv() treats 0 and ~0U equivalently, due to

	pinv = opt_recv->ccid3or_loss_event_rate;
	if (pinv == ~0U || pinv == 0)
		hctx->p = 0;

    - as a result, the sequence number field is removed from opt_recv.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 57 +++++++++++++-------------------------------------
 net/dccp/ccids/ccid3.h |  3 ---
 2 files changed, 14 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f74e58d9793..12b601f11bf 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -487,60 +487,31 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
 				     unsigned char len, u16 idx,
 				     unsigned char *value)
 {
-	int rc = 0;
-	const struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 	struct ccid3_options_received *opt_recv = &hctx->options_received;
 	__be32 opt_val;
 
-	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
-		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
-		opt_recv->ccid3or_loss_event_rate    = ~0;
-		opt_recv->ccid3or_loss_intervals_idx = 0;
-		opt_recv->ccid3or_loss_intervals_len = 0;
-		opt_recv->ccid3or_receive_rate	     = 0;
-	}
-
 	switch (option) {
+	case TFRC_OPT_RECEIVE_RATE:
 	case TFRC_OPT_LOSS_EVENT_RATE:
 		if (unlikely(len != 4)) {
-			DCCP_WARN("%s(%p), invalid len %d "
-				  "for TFRC_OPT_LOSS_EVENT_RATE\n",
-				  dccp_role(sk), sk, len);
-			rc = -EINVAL;
-		} else {
-			opt_val = get_unaligned((__be32 *)value);
-			opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
-			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
-				       dccp_role(sk), sk,
-				       opt_recv->ccid3or_loss_event_rate);
+			DCCP_WARN("%s(%p), invalid len %d for %u\n",
+				  dccp_role(sk), sk, len, option);
+			return -EINVAL;
 		}
-		break;
-	case TFRC_OPT_LOSS_INTERVALS:
-		opt_recv->ccid3or_loss_intervals_idx = idx;
-		opt_recv->ccid3or_loss_intervals_len = len;
-		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
-			       dccp_role(sk), sk,
-			       opt_recv->ccid3or_loss_intervals_idx,
-			       opt_recv->ccid3or_loss_intervals_len);
-		break;
-	case TFRC_OPT_RECEIVE_RATE:
-		if (unlikely(len != 4)) {
-			DCCP_WARN("%s(%p), invalid len %d "
-				  "for TFRC_OPT_RECEIVE_RATE\n",
-				  dccp_role(sk), sk, len);
-			rc = -EINVAL;
-		} else {
-			opt_val = get_unaligned((__be32 *)value);
-			opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+		opt_val = ntohl(get_unaligned((__be32 *)value));
+
+		if (option == TFRC_OPT_RECEIVE_RATE) {
+			opt_recv->ccid3or_receive_rate = opt_val;
 			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
-				       dccp_role(sk), sk,
-				       opt_recv->ccid3or_receive_rate);
+				       dccp_role(sk), sk, opt_val);
+		} else {
+			opt_recv->ccid3or_loss_event_rate = opt_val;
+			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+				       dccp_role(sk), sk, opt_val);
 		}
-		break;
 	}
-
-	return rc;
+	return 0;
 }
 
 static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 92a5e1688a0..2268785ae8e 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -71,9 +71,6 @@ enum ccid3_options {
 };
 
 struct ccid3_options_received {
-	u64 ccid3or_seqno:48,
-	    ccid3or_loss_intervals_idx:16;
-	u16 ccid3or_loss_intervals_len;
 	u32 ccid3or_loss_event_rate;
 	u32 ccid3or_receive_rate;
 };
-- 
cgit v1.2.3


From 3306c781ff13aea89606435c134ec84e3c608681 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Add packet type information to CCID-specific option parsing

This patch ...
 1. adds packet type information to ccid_hc_{rx,tx}_parse_options(). This is
    necessary, since table 3 in RFC 4340, 5.8 leaves it to the CCIDs to state
    which options may (not) appear on what packet type.

 2. adds such a check for CCID-3's {Loss Event, Receive} Rate as specified in
    RFC 4340 8.3 ("Receive Rate options MUST NOT be sent on DCCP-Data packets")
    and 8.5 ("Loss Event Rate options MUST NOT be sent on DCCP-Data packets").

 3. removes an unused argument `idx' from ccid_hc_{rx,tx}_parse_options(). This
    is also no longer necessary, since the CCID-specific option-parsing routines
    are passed every single parameter of the type-length-value option encoding.

Also added documentation and made argument naming scheme consistent.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccid.h        | 46 +++++++++++++++++++++++-----------------------
 net/dccp/ccids/ccid3.c | 14 ++++++++------
 net/dccp/options.c     | 16 ++++------------
 3 files changed, 35 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 20ba066b277..bce517c9999 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -60,18 +60,14 @@ struct ccid_operations {
 	void		(*ccid_hc_tx_exit)(struct sock *sk);
 	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_rx_parse_options)(struct sock *sk,
-						    unsigned char option,
-						    unsigned char len, u16 idx,
-						    unsigned char* value);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
 	int		(*ccid_hc_rx_insert_options)(struct sock *sk,
 						     struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_tx_parse_options)(struct sock *sk,
-						    unsigned char option,
-						    unsigned char len, u16 idx,
-						    unsigned char* value);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
 	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
 						  struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
@@ -163,27 +159,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
 		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
 }
 
+/**
+ * ccid_hc_tx_parse_options  -  Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
-					   unsigned char option,
-					   unsigned char len, u16 idx,
-					   unsigned char* value)
+					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	int rc = 0;
-	if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
-						    value);
-	return rc;
+	if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
 }
 
+/**
+ * ccid_hc_rx_parse_options  -  Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
-					   unsigned char option,
-					   unsigned char len, u16 idx,
-					   unsigned char* value)
+					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	int rc = 0;
-	if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
-		rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
-	return rc;
+	if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
 }
 
 static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 12b601f11bf..4c422fb2189 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -483,9 +483,8 @@ done_computing_x:
 			   jiffies + usecs_to_jiffies(t_nfb));
 }
 
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
-				     unsigned char len, u16 idx,
-				     unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 	struct ccid3_options_received *opt_recv = &hctx->options_received;
@@ -494,12 +493,15 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
 	switch (option) {
 	case TFRC_OPT_RECEIVE_RATE:
 	case TFRC_OPT_LOSS_EVENT_RATE:
-		if (unlikely(len != 4)) {
+		/* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+		if (packet_type == DCCP_PKT_DATA)
+			break;
+		if (unlikely(optlen != 4)) {
 			DCCP_WARN("%s(%p), invalid len %d for %u\n",
-				  dccp_role(sk), sk, len, option);
+				  dccp_role(sk), sk, optlen, option);
 			return -EINVAL;
 		}
-		opt_val = ntohl(get_unaligned((__be32 *)value));
+		opt_val = ntohl(get_unaligned((__be32 *)optval));
 
 		if (option == TFRC_OPT_RECEIVE_RATE) {
 			opt_recv->ccid3or_receive_rate = opt_val;
diff --git a/net/dccp/options.c b/net/dccp/options.c
index fd51cc70c63..b102774694c 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -226,23 +226,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
 				      dccp_role(sk), elapsed_time);
 			break;
-		case 128 ... 191: {
-			const u16 idx = value - options;
-
+		case 128 ... 191:
 			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
-						     opt, len, idx,
-						     value) != 0)
+						     pkt_type, opt, value, len))
 				goto out_invalid_option;
-		}
 			break;
-		case 192 ... 255: {
-			const u16 idx = value - options;
-
+		case 192 ... 255:
 			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
-						     opt, len, idx,
-						     value) != 0)
+						     pkt_type, opt, value, len))
 				goto out_invalid_option;
-		}
 			break;
 		default:
 			DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
-- 
cgit v1.2.3


From 535c55df136ad2783d444e54d518a8fae8bdbf79 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc/ccid-3: Computing Loss Rate from Loss Event Rate

This adds a function to take care of the following cases occurring in the
computation of the Loss Rate p:

 * 1/(2^32-1) is mapped into 0% as per RFC 4342, 8.5;
 * 1/0        is mapped into the maximum of 100%;
 * we want to avoid that p = 1/x is rounded down to 0 when x is very large,
   since this means accidentally re-entering slow-start (indicated by p==0).

In the last case, the minimum-resolution value of p is returned.

Furthermore, a bug in ccid3_hc_rx_getsockopt is fixed (1/0 was mapped into ~0U),
which now allows to consistently print the scaled p-values as

        printf("Loss Event Rate = %u.%04u %%\n", rx_info.tfrcrx_p / 10000,
                                                 rx_info.tfrcrx_p % 10000);

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c             |  9 ++++-----
 net/dccp/ccids/lib/tfrc.h          |  1 +
 net/dccp/ccids/lib/tfrc_equation.c | 17 +++++++++++++++--
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 4c422fb2189..206204551f4 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -410,10 +410,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 	/* Update loss event rate (which is scaled by 1e6) */
 	pinv = opt_recv->ccid3or_loss_event_rate;
-	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
+	if (pinv == 0)
 		hctx->p = 0;
-	else				       /* can not exceed 100% */
-		hctx->p = scaled_div(1, pinv);
+	else
+		hctx->p = tfrc_invert_loss_event_rate(pinv);
 
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
@@ -854,8 +854,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
 			return -EINVAL;
 		rx_info.tfrcrx_x_recv = hcrx->x_recv;
 		rx_info.tfrcrx_rtt    = hcrx->rtt;
-		rx_info.tfrcrx_p      = hcrx->p_inverse == 0 ? ~0U :
-					   scaled_div(1, hcrx->p_inverse);
+		rx_info.tfrcrx_p      = tfrc_invert_loss_event_rate(hcrx->p_inverse);
 		len = sizeof(rx_info);
 		val = &rx_info;
 		break;
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ed9857527ac..bb47146ac7d 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -58,6 +58,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
 
 extern u32  tfrc_calc_x(u16 s, u32 R, u32 p);
 extern u32  tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern u32  tfrc_invert_loss_event_rate(u32 loss_event_rate);
 
 extern int  tfrc_tx_packet_history_init(void);
 extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 2f20a29cffe..bc3dc2b2a84 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -658,7 +658,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 	result = scaled_div(s, R);
 	return scaled_div32(result, f);
 }
-
 EXPORT_SYMBOL_GPL(tfrc_calc_x);
 
 /**
@@ -693,5 +692,19 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 	index = tfrc_binsearch(fvalue, 0);
 	return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
 }
-
 EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
+
+/**
+ * tfrc_invert_loss_event_rate  -  Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+	if (loss_event_rate == UINT_MAX)		/* see RFC 4342, 8.5 */
+		return 0;
+	if (unlikely(loss_event_rate == 0))		/* map 1/0 into 100% */
+		return 1000000;
+	return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
+EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
-- 
cgit v1.2.3


From ce177ae2e6b196659e93a9408cc1f5f13f206d13 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Remove redundant 'options_received' struct

The `options_received' struct is redundant, since it re-duplicates the existing
`p' and `x_recv' fields. This patch removes the sub-struct and migrates the
format conversion operations (cf. below) to ccid3_hc_tx_parse_options().

                     Why the fields are redundant
                     ----------------------------
The Loss Event Rate p and the Receive Rate x_recv are initially 0 when first
loading CCID-3, as ccid_new() zeroes out the entire ccid3_hc_tx_sock.

When Loss Event Rate or Receive Rate options are received, they are stored by
ccid3_hc_tx_parse_options() into the fields `ccid3or_loss_event_rate' and
`ccid3or_receive_rate' of the sub-struct `options_received' in ccid3_hc_tx_sock.

After parsing (considering only the established state - dccp_rcv_established()),
the packet is passed on to ccid_hc_tx_packet_recv(). This calls the CCID-3
specific routine ccid3_hc_tx_packet_recv(), which performs the following copy
operations between fields of ccid3_hc_tx_sock:

 * hctx->options_received.ccid3or_receive_rate is copied into hctx->x_recv,
   after scaling it for fixpoint arithmetic, by 2^64;
 * hctx->options_received.ccid3or_loss_event_rate is copied into hctx->p,
   considering the above special cases; in addition, a value of 0 here needs to
   be mapped into p=0 (when no Loss Event Rate option has been received yet).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 24 ++++++++----------------
 net/dccp/ccids/ccid3.h |  7 -------
 2 files changed, 8 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 206204551f4..e7db8a4ee64 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -370,11 +370,10 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	struct ccid3_options_received *opt_recv = &hctx->options_received;
 	struct tfrc_tx_hist_entry *acked;
 	ktime_t now;
 	unsigned long t_nfb;
-	u32 pinv, r_sample;
+	u32 r_sample;
 
 	/* we are only interested in ACKs */
 	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
@@ -404,17 +403,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
 	hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
 
-	/* Update receive rate in units of 64 * bytes/second */
-	hctx->x_recv = opt_recv->ccid3or_receive_rate;
-	hctx->x_recv <<= 6;
-
-	/* Update loss event rate (which is scaled by 1e6) */
-	pinv = opt_recv->ccid3or_loss_event_rate;
-	if (pinv == 0)
-		hctx->p = 0;
-	else
-		hctx->p = tfrc_invert_loss_event_rate(pinv);
-
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
 	 */
@@ -487,7 +475,6 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
 				     u8 option, u8 *optval, u8 optlen)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	struct ccid3_options_received *opt_recv = &hctx->options_received;
 	__be32 opt_val;
 
 	switch (option) {
@@ -504,11 +491,16 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
 		opt_val = ntohl(get_unaligned((__be32 *)optval));
 
 		if (option == TFRC_OPT_RECEIVE_RATE) {
-			opt_recv->ccid3or_receive_rate = opt_val;
+			/* Receive Rate is kept in units of 64 bytes/second */
+			hctx->x_recv = opt_val;
+			hctx->x_recv <<= 6;
+
 			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
 				       dccp_role(sk), sk, opt_val);
 		} else {
-			opt_recv->ccid3or_loss_event_rate = opt_val;
+			/* Update the fixpoint Loss Event Rate fraction */
+			hctx->p = tfrc_invert_loss_event_rate(opt_val);
+
 			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
 				       dccp_role(sk), sk, opt_val);
 		}
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 2268785ae8e..ae210786c89 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -70,11 +70,6 @@ enum ccid3_options {
 	TFRC_OPT_RECEIVE_RATE	 = 194,
 };
 
-struct ccid3_options_received {
-	u32 ccid3or_loss_event_rate;
-	u32 ccid3or_receive_rate;
-};
-
 /* TFRC sender states */
 enum ccid3_hc_tx_states {
 	TFRC_SSTATE_NO_SENT = 1,
@@ -101,7 +96,6 @@ enum ccid3_hc_tx_states {
  * @t_ld - Time last doubled during slow start
  * @t_nom - Nominal send time of next packet
  * @hist - Packet history
- * @options_received - Parsed set of retrieved options
  */
 struct ccid3_hc_tx_sock {
 	u64				x;
@@ -119,7 +113,6 @@ struct ccid3_hc_tx_sock {
 	ktime_t				t_ld;
 	ktime_t				t_nom;
 	struct tfrc_tx_hist_entry	*hist;
-	struct ccid3_options_received	options_received;
 };
 
 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
-- 
cgit v1.2.3


From f10ecaee6dc2c6d56783462b2a82e98bc81b55f4 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Replace magic CCID-specific numbers by symbolic constants

The constants DCCPO_{MIN,MAX}_CCID_SPECIFIC are nowhere used in the code, but
instead for the CCID-specific options numbers are used.

This patch unifies the use of CCID-specific option numbers, by adding symbolic
names reflecting the definitions in RFC 4340, 10.3.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index b102774694c..e1c94e2b8be 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		}
 
 		/*
-		 * CCID-Specific Options (from RFC 4340, sec. 10.3):
-		 *
-		 * Option numbers 128 through 191 are for options sent from the
-		 * HC-Sender to the HC-Receiver; option numbers 192 through 255
-		 * are for options sent from the HC-Receiver to	the HC-Sender.
-		 *
 		 * CCID-specific options are ignored during connection setup, as
 		 * negotiation may still be in progress (see RFC 4340, 10.3).
 		 * The same applies to Ack Vectors, as these depend on the CCID.
-		 *
 		 */
-		if (dreq != NULL && (opt >= 128 ||
+		if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
 		    opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
 			goto ignore_option;
 
@@ -226,12 +219,12 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
 				      dccp_role(sk), elapsed_time);
 			break;
-		case 128 ... 191:
+		case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
 			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
 						     pkt_type, opt, value, len))
 				goto out_invalid_option;
 			break;
-		case 192 ... 255:
+		case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
 			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
 						     pkt_type, opt, value, len))
 				goto out_invalid_option;
-- 
cgit v1.2.3


From c506d91d9ab7681e058afcd750e9118c6cdaabc1 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Unused argument in CCID tx function

This removes the argument `more' from ccid_hc_tx_packet_sent, since it was
nowhere used in the entire code.

(Anecdotally, this argument was not even used in the original KAME code where
 the function originally came from; compare the variable moreToSend in the
 freebsd61-dccp-kame-28.08.2006.patch now maintained by Emmanuel Lochin.)

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccid.h        | 6 +++---
 net/dccp/ccids/ccid2.c | 2 +-
 net/dccp/ccids/ccid3.c | 3 +--
 net/dccp/output.c      | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index bce517c9999..430a3c24f97 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -71,7 +71,7 @@ struct ccid_operations {
 	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
 						  struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
-						  int more, unsigned int len);
+						  unsigned int len);
 	void		(*ccid_hc_rx_get_info)(struct sock *sk,
 					       struct tcp_info *info);
 	void		(*ccid_hc_tx_get_info)(struct sock *sk,
@@ -139,10 +139,10 @@ static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
 }
 
 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
-					  int more, unsigned int len)
+					  unsigned int len)
 {
 	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
-		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
+		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
 }
 
 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9728bbf0ace..f56ab68a4b7 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -221,7 +221,7 @@ static void ccid2_start_rto_timer(struct sock *sk)
 		       jiffies + hctx->rto);
 }
 
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index e7db8a4ee64..c1cc66edfad 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -356,8 +356,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
-				    unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 9888f61f9b0..be65bc30e20 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -301,7 +301,7 @@ void dccp_write_xmit(struct sock *sk, int block)
 				dcb->dccpd_type = DCCP_PKT_DATA;
 
 			err = dccp_transmit_skb(sk, skb);
-			ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
+			ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
 			if (err)
 				DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
 					 err);
-- 
cgit v1.2.3


From 5fe94963a163fecc34b7b51bf2ca525f9f50d7bf Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Remove duplicate documentation

This removes RX-socket documentation which is either duplicate or non-existent.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index ae210786c89..de26db82d65 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -131,16 +131,12 @@ enum ccid3_hc_rx_states {
 
 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
  *
- *  @x_recv  -  Receiver estimate of send rate (RFC 3448 4.3)
- *  @rtt  -  Receiver estimate of rtt (non-standard)
- *  @p  -  Current loss event rate (RFC 3448 5.4)
  *  @last_counter  -  Tracks window counter (RFC 4342, 8.1)
  *  @state  -  Receiver state, one of %ccid3_hc_rx_states
  *  @bytes_recv  -  Total sum of DCCP payload bytes
  *  @x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
  *  @rtt  -  Receiver estimate of RTT
  *  @tstamp_last_feedback  -  Time at which last feedback was sent
- *  @tstamp_last_ack  -  Time at which last feedback was sent
  *  @hist  -  Packet history (loss detection + RTT sampling)
  *  @li_hist  -  Loss Interval database
  *  @s  -  Received packet size in bytes
-- 
cgit v1.2.3


From d0995e6a9e3328cdc76b4c45882dee118284f960 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Remove dead states

This patch is thanks to an investigation by Leandro Sales de Melo and his
colleagues. They worked out two state diagrams which highlight the fact that
the xxx_TERM states in CCID-3/4 are in fact not necessary.

And this can be confirmed by in turn looking at the code: the xxx_TERM states
are only ever set in ccid3_hc_{rx,tx}_exit(). These two functions are part
of the following call chain:

 * ccid_hc_{tx,rx}_exit() are called from ccid_delete() only;
 * ccid_delete() invokes ccid_hc_{tx,rx}_exit() in the way of a destructor:
   after calling ccid_hc_{tx,rx}_exit(), the CCID is released from memory;
 * ccid_delete() is in turn called only by ccid_hc_{tx,rx}_delete();
 * ccid_hc_{tx,rx}_delete() is called only if
   - feature negotiation failed   (dccp_feat_activate_values()),
   - when changing the RX/TX CCID (to eject the current CCID),
   - when destroying the socket   (in dccp_destroy_sock()).

In other words, when CCID-3 sets the state to xxx_TERM, it is at a time where
no more processing should be going on, hence it is not necessary to introduce
a dedicated exit state - this is implicit when unloading the CCID.

The patch removes this state, one switch-statement collapses as a result.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 37 +++++++++----------------------------
 net/dccp/ccids/ccid3.h |  2 --
 2 files changed, 9 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index c1cc66edfad..0751a8fa936 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -56,7 +56,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
 	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
 	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
 	[TFRC_SSTATE_FBACK]    = "FBACK",
-	[TFRC_SSTATE_TERM]     = "TERM",
 	};
 
 	return ccid3_state_names[state];
@@ -210,10 +209,13 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
 		       ccid3_tx_state_name(hctx->state));
 
+	/* Ignore and do not restart after leaving the established state */
+	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+		goto out;
+
+	/* Reset feedback state to "no feedback received" */
 	if (hctx->state == TFRC_SSTATE_FBACK)
 		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-	else if (hctx->state != TFRC_SSTATE_NO_FBACK)
-		goto out;
 
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
@@ -288,8 +290,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	switch (hctx->state) {
-	case TFRC_SSTATE_NO_SENT:
+	if (hctx->state == TFRC_SSTATE_NO_SENT) {
 		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
 				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
 		hctx->last_win_count   = 0;
@@ -324,9 +325,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		ccid3_update_send_interval(hctx);
 
 		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-		break;
-	case TFRC_SSTATE_NO_FBACK:
-	case TFRC_SSTATE_FBACK:
+
+	} else {
 		delay = ktime_us_delta(hctx->t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
 		/*
@@ -341,10 +341,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			return (u32)delay / USEC_PER_MSEC;
 
 		ccid3_hc_tx_update_win_count(hctx, now);
-		break;
-	case TFRC_SSTATE_TERM:
-		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
-		return -EINVAL;
 	}
 
 	/* prepare to send now (add options etc.) */
@@ -378,11 +374,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
 	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
 		return;
-	/* ... and only in the established state */
-	if (hctx->state != TFRC_SSTATE_FBACK &&
-	    hctx->state != TFRC_SSTATE_NO_FBACK)
-		return;
-
 	/*
 	 * Locate the acknowledged packet in the TX history.
 	 *
@@ -522,9 +513,7 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
-	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
 	sk_stop_timer(sk, &hctx->no_feedback_timer);
-
 	tfrc_tx_hist_purge(&hctx->hist);
 }
 
@@ -583,7 +572,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
 	static char *ccid3_rx_state_names[] = {
 	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
 	[TFRC_RSTATE_DATA]    = "DATA",
-	[TFRC_RSTATE_TERM]    = "TERM",
 	};
 
 	return ccid3_rx_state_names[state];
@@ -609,14 +597,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	ktime_t now;
+	ktime_t now = ktime_get_real();
 	s64 delta = 0;
 
-	if (unlikely(hcrx->state == TFRC_RSTATE_TERM))
-		return;
-
-	now = ktime_get_real();
-
 	switch (fbtype) {
 	case CCID3_FBACK_INITIAL:
 		hcrx->x_recv = 0;
@@ -819,8 +802,6 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 
-	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
 	tfrc_rx_hist_purge(&hcrx->hist);
 	tfrc_lh_cleanup(&hcrx->li_hist);
 }
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index de26db82d65..7884159937a 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -75,7 +75,6 @@ enum ccid3_hc_tx_states {
 	TFRC_SSTATE_NO_SENT = 1,
 	TFRC_SSTATE_NO_FBACK,
 	TFRC_SSTATE_FBACK,
-	TFRC_SSTATE_TERM,
 };
 
 /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
@@ -126,7 +125,6 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
 enum ccid3_hc_rx_states {
 	TFRC_RSTATE_NO_DATA = 1,
 	TFRC_RSTATE_DATA,
-	TFRC_RSTATE_TERM    = 127,
 };
 
 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
-- 
cgit v1.2.3


From 2975abd251d795810932b20354729ba236d95bf9 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Schedule an Ack when receiving timestamps

This schedules an Ack when receiving a timestamp, exploiting the
existing inet_csk_schedule_ack() function, saving one case in the
`dccp_ack_pending()' function.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/dccp.h    | 2 +-
 net/dccp/options.c | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index c7370de4c4d..f9ed0cbd1bf 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -431,7 +431,7 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
 static inline int dccp_ack_pending(const struct sock *sk)
 {
 	const struct dccp_sock *dp = dccp_sk(sk);
-	return dp->dccps_timestamp_echo != 0 ||
+	return
 #ifdef CONFIG_IP_DCCP_ACKVEC
 	       (dp->dccps_hc_rx_ackvec != NULL &&
 		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
diff --git a/net/dccp/options.c b/net/dccp/options.c
index e1c94e2b8be..9fe0510f402 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -163,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 				      dccp_role(sk), ntohl(opt_val),
 				      (unsigned long long)
 				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+			/* schedule an Ack in case this sender is quiescent */
+			inet_csk_schedule_ack(sk);
 			break;
 		case DCCPO_TIMESTAMP_ECHO:
 			if (len != 4 && len != 6 && len != 8)
-- 
cgit v1.2.3


From bfbddd085a5bced6efb9e1bc4d029438f9639784 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Fix the adjustments to AWL and SWL

This fixes a problem and a potential loophole with regard to seqno/ackno
validity: the problem is that the initial adjustments to AWL/SWL were
only performed at the begin of the connection, during the handshake.

Since the Sequence Window feature is always greater than Wmin=32 (7.5.2),
it is however necessary to perform these adjustments at least for the first
W/W' (variables as per 7.5.1) packets in the lifetime of a connection.

This requirement is complicated by the fact that W/W' can change at any time
during the lifetime of a connection.

Therefore the consequence is to perform this safety check each time SWL/AWL
are updated.

A second problem solved by this patch is that the remote/local Sequence Window
feature values (which set the bounds for AWL/SWL/SWH) are undefined until the
feature negotiation has completed.

During the initial handshake we have more stringent sequence number protection,
the changes added by this patch effect that {A,S}W{L,H} are within the correct
bounds at the instant that feature negotiation completes (since the SeqWin
feature activation handlers call dccp_update_gsr/gss()).

A detailed rationale is below -- can be removed from the commit message.


1. Server sequence number checks during initial handshake
---------------------------------------------------------
The server can not use the fields of the listening socket for seqno/ackno checks
and thus needs to store all relevant information on a per-connection basis on
the dccp_request socket. This is a size-constrained structure and has currently
only ISS (dreq_iss) and ISR (dreq_isr) defined.
Adding further fields (SW{L,H}, AW{L,H}) would increase the size of the struct
and it is questionable whether this will have any practical gain. The currently
implemented solution is as follows.
 * receiving first Request: dccp_v{4,6}_conn_request sets
                            ISR := P.seqno, ISS := dccp_v{4,6}_init_sequence()

 * sending first Response:  dccp_v{4,6}_send_response via dccp_make_response()
                            sets P.seqno := ISS, sets P.ackno := ISR

 * receiving retransmitted Request: dccp_check_req() overrides ISR := P.seqno

 * answering retransmitted Request: dccp_make_response() sets ISS += 1,
                                    otherwise as per first Response

 * completing the handshake: succeeds in dccp_check_req() for the first Ack
                             where P.ackno == ISS (P.seqno is not tested)

 * creating child socket: ISS, ISR are copied from the request_sock

This solution will succeed whenever the server can receive the Request and the
subsequent Ack in succession, without retransmissions. If there is packet loss,
the client needs to retransmit until this condition succeeds; it will otherwise
eventually give up. Adding further fields to the request_sock could increase
the robustness a bit, in that it would make possible to let a reordered Ack
(from a retransmitted Response) pass. The argument against such a solution is
that if the packet loss is not persistent and an Ack gets through, why not
wait for the one answering the original response: if the loss is persistent, it
is probably better to not start the connection in the first place.

Long story short: the present design (by Arnaldo) is simple and will likely work
just as well as a more complicated solution. As a consequence, {A,S}W{L,H} are
not needed until the moment the request_sock is cloned into the accept queue.

At that stage feature negotiation has completed, so that the values for the local
and remote Sequence Window feature (7.5.2) are known, i.e. we are now in a better
position to compute {A,S}W{L,H}.


2. Client sequence number checks during initial handshake
---------------------------------------------------------
Until entering PARTOPEN the client does not need the adjustments, since it
constrains the Ack window to the packet it sent.

 * sending first Request: dccp_v{4,6}_connect() choose ISS,
                          dccp_connect() then sets GAR := ISS (as per 8.5),
			  dccp_transmit_skb() (with the previous bug fix) sets
			         GSS := ISS, AWL := ISS, AWH := GSS
 * n-th retransmitted Request (with previous patch):
	                  dccp_retransmit_skb() via timer calls
			  dccp_transmit_skb(), which sets GSS := ISS+n
                          and then AWL := ISS, AWH := ISS+n

 * receiving any Response: dccp_rcv_request_sent_state_process()
	                   -- accepts packet if AWL <= P.ackno <= AWH;
			   -- sets GSR = ISR = P.seqno

 * sending the Ack completing the handshake: dccp_send_ack() calls
                           dccp_transmit_skb(), which sets GSS += 1
			   and AWL := ISS, AWH := GSS


Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/dccp.h      | 20 ++++++++++++++++++++
 net/dccp/input.c     | 18 ++++++------------
 net/dccp/minisocks.c | 30 +++++++++---------------------
 3 files changed, 35 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f9ed0cbd1bf..e4d6e76ced4 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -415,6 +415,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
 	dp->dccps_gsr = seq;
 	/* Sequence validity window depends on remote Sequence Window (7.5.1) */
 	dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+	/*
+	 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+	 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+	 * always > 32, so for the first W/W' packets in the lifetime of a
+	 * connection we always have to adjust SWL.
+	 * A second reason why we are doing this is that the window depends on
+	 * the feature-remote value of Sequence Window: nothing stops the peer
+	 * from updating this value while we are busy adjusting SWL for the
+	 * first W packets (we would have to count from scratch again then).
+	 * Therefore it is safer to always make sure that the Sequence Window
+	 * is not artificially extended by a peer who grows SWL downwards by
+	 * continually updating the feature-remote Sequence-Window.
+	 * If sequence numbers wrap it is bad luck. But that will take a while
+	 * (48 bit), and this measure prevents Sequence-number attacks.
+	 */
+	if (before48(dp->dccps_swl, dp->dccps_isr))
+		dp->dccps_swl = dp->dccps_isr;
 	dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
 }
 
@@ -425,6 +442,9 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
 	dp->dccps_gss = seq;
 	/* Ack validity window depends on local Sequence Window value (7.5.1) */
 	dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+	/* Adjust AWL so that it is not below ISS - see comment above for SWL */
+	if (before48(dp->dccps_awl, dp->dccps_iss))
+		dp->dccps_awl = dp->dccps_iss;
 	dp->dccps_awh = dp->dccps_gss;
 }
 
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 5eb443f656c..e3f43d55e3c 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -440,20 +440,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 
-		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
-		dccp_update_gsr(sk, dp->dccps_isr);
 		/*
-		 * SWL and AWL are initially adjusted so that they are not less than
-		 * the initial Sequence Numbers received and sent, respectively:
-		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
-		 *	AWL := max(GSS - W' + 1, ISS).
-		 * These adjustments MUST be applied only at the beginning of the
-		 * connection.
-		 *
-		 * AWL was adjusted in dccp_v4_connect -acme
+		 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+		 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+		 * is done as part of activating the feature values below, since
+		 * these settings depend on the local/remote Sequence Window
+		 * features, which were undefined or not confirmed until now.
 		 */
-		dccp_set_seqno(&dp->dccps_swl,
-			       max48(dp->dccps_swl, dp->dccps_isr));
+		dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
 
 		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 0ecb19c5e8c..f4d9c8f60ed 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -120,30 +120,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		 *
 		 *    Choose S.ISS (initial seqno) or set from Init Cookies
 		 *    Initialize S.GAR := S.ISS
-		 *    Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
-		 */
-		newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
-		dccp_update_gss(newsk, dreq->dreq_iss);
-
-		newdp->dccps_isr = dreq->dreq_isr;
-		dccp_update_gsr(newsk, dreq->dreq_isr);
-
-		/*
-		 * SWL and AWL are initially adjusted so that they are not less than
-		 * the initial Sequence Numbers received and sent, respectively:
-		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
-		 *	AWL := max(GSS - W' + 1, ISS).
-		 * These adjustments MUST be applied only at the beginning of the
-		 * connection.
+		 *    Set S.ISR, S.GSR from packet (or Init Cookies)
+		 *
+		 *    Setting AWL/AWH and SWL/SWH happens as part of the feature
+		 *    activation below, as these windows all depend on the local
+		 *    and remote Sequence Window feature values (7.5.2).
 		 */
-		dccp_set_seqno(&newdp->dccps_swl,
-			       max48(newdp->dccps_swl, newdp->dccps_isr));
-		dccp_set_seqno(&newdp->dccps_awl,
-			       max48(newdp->dccps_awl, newdp->dccps_iss));
+		newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
+		newdp->dccps_gar = newdp->dccps_iss;
+		newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
 
 		/*
-		 * Activate features after initialising the sequence numbers,
-		 * since CCID initialisation may depend on GSS, ISR, ISS etc.
+		 * Activate features: initialise CCIDs, sequence windows etc.
 		 */
 		if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
 			/* It is still raw copy of parent, so invalidate
-- 
cgit v1.2.3


From a9c1656ab10480cc6f6d34f193bcde2729fe8037 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Merge now-reduced connect_init() function

After moving the assignment of GAR/ISS from dccp_connect_init() to
dccp_transmit_skb(), the former function becomes very small, so that
a merger with dccp_connect() suggests itself.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/output.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/dccp/output.c b/net/dccp/output.c
index be65bc30e20..1b316830758 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -471,8 +471,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
 /*
  * Do all connect socket setups that can be done AF independent.
  */
-static inline void dccp_connect_init(struct sock *sk)
+int dccp_connect(struct sock *sk)
 {
+	struct sk_buff *skb;
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -482,22 +483,12 @@ static inline void dccp_connect_init(struct sock *sk)
 
 	dccp_sync_mss(sk, dst_mtu(dst));
 
-	/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
-	dp->dccps_gar = dp->dccps_iss;
-
-	icsk->icsk_retransmits = 0;
-}
-
-int dccp_connect(struct sock *sk)
-{
-	struct sk_buff *skb;
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
 	/* do not connect if feature negotiation setup fails */
 	if (dccp_feat_finalise_settings(dccp_sk(sk)))
 		return -EPROTO;
 
-	dccp_connect_init(sk);
+	/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+	dp->dccps_gar = dp->dccps_iss;
 
 	skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
 	if (unlikely(skb == NULL))
@@ -513,6 +504,7 @@ int dccp_connect(struct sock *sk)
 	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the REQUEST until an answer. */
+	icsk->icsk_retransmits = 0;
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 				  icsk->icsk_rto, DCCP_RTO_MAX);
 	return 0;
-- 
cgit v1.2.3


From b8c6bcee1dbc1aadcd67af998e414e73fa166a7d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Reduce noise in output and convert to ktime_t

This fixes the problem that dccp_probe output can grow quite large without
apparent benefit (many identical data points), creating huge files (up to
over one Gigabyte for a few minutes' test run) which are very hard to
post-process (in one instance it got so bad that gnuplot ate up all memory
plus swap).

The cause for the problem is that the kprobe is inserted into dccp_sendmsg(),
which can be called in a polling-mode (whenever the TX queue is full due to
congestion-control issues, EAGAIN is returned). This creates many very
similar data points, i.e. the increase of processing time does not increase
the quality/information of the probe output.

The fix is to attach the probe to a different function -- write_xmit was
chosen since it gets called continually (both via userspace and timer);
an input-path function would stop sampling as soon as the other end stops
sending feedback.

For comparison the output file sizes for the same 20 second test
run over a lossy link:
           * before / without patch:  118   Megabytes
           * after  / with patch:       1.2 Megabytes
and there was much less noise in the output.

To allow backward compatibility with scripts that people use, the now-unused
`size' field in the output has been replaced with the CCID identifier. This
also serves for future compatibility - support for CCID2 is work in progress
(depends on the still unfinished SRTT/RTTVAR updates).

While at it, the update to ktime_t was also performed.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
---
 net/dccp/probe.c | 66 +++++++++++++++++++++-----------------------------------
 1 file changed, 25 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index a87fd4fc1b3..eaa59d82ab0 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -46,70 +46,54 @@ static struct {
 	struct kfifo	  *fifo;
 	spinlock_t	  lock;
 	wait_queue_head_t wait;
-	struct timespec	  tstart;
+	ktime_t		  start;
 } dccpw;
 
-static void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	struct timespec now;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	getnstimeofday(&now);
-
-	now = timespec_sub(now, dccpw.tstart);
-
-	len = sprintf(tbuf, "%lu.%06lu ",
-		      (unsigned long) now.tv_sec,
-		      (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
-	va_end(args);
-
-	kfifo_put(dccpw.fifo, tbuf, len);
-	wake_up(&dccpw.wait);
-}
-
-static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
-			 struct msghdr *msg, size_t size)
+static void jdccp_write_xmit(struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
 	struct ccid3_hc_tx_sock *hctx = NULL;
+	struct timespec tv;
+	char buf[256];
+	int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk));
 
-	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+	if (ccid == DCCPC_CCID3)
 		hctx = ccid3_hc_tx_sk(sk);
 
-	if (port == 0 || ntohs(inet->dport) == port ||
-	    ntohs(inet->sport) == port) {
-		if (hctx)
-			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u "
-			       "%llu %llu %d\n",
+	if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) {
+
+		tv  = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start));
+		len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d",
+			       (unsigned long)tv.tv_sec,
+			       (unsigned long)tv.tv_nsec,
 			       NIPQUAD(inet->saddr), ntohs(inet->sport),
-			       NIPQUAD(inet->daddr), ntohs(inet->dport), size,
+			       NIPQUAD(inet->daddr), ntohs(inet->dport), ccid);
+
+		if (hctx)
+			len += sprintf(buf + len, " %d %d %d %u %u %u %d",
 			       hctx->s, hctx->rtt, hctx->p, hctx->x_calc,
-			       hctx->x_recv >> 6, hctx->x >> 6, hctx->t_ipi);
-		else
-			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
-			       NIPQUAD(inet->saddr), ntohs(inet->sport),
-			       NIPQUAD(inet->daddr), ntohs(inet->dport), size);
+			       (unsigned)(hctx->x_recv >> 6),
+			       (unsigned)(hctx->x >> 6), hctx->t_ipi);
+
+		len += sprintf(buf + len, "\n");
+		kfifo_put(dccpw.fifo, buf, len);
+		wake_up(&dccpw.wait);
 	}
 
 	jprobe_return();
-	return 0;
 }
 
 static struct jprobe dccp_send_probe = {
 	.kp	= {
-		.symbol_name = "dccp_sendmsg",
+		.symbol_name = "dccp_write_xmit",
 	},
-	.entry	= jdccp_sendmsg,
+	.entry	= jdccp_write_xmit,
 };
 
 static int dccpprobe_open(struct inode *inode, struct file *file)
 {
 	kfifo_reset(dccpw.fifo);
-	getnstimeofday(&dccpw.tstart);
+	dccpw.start = ktime_get();
 	return 0;
 }
 
-- 
cgit v1.2.3


From ff49e27089ec363b7fc3849504e0435d447ab18a Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Ack Vector interface clean-up

This patch brings the Ack Vector interface up to date. Its main purpose is
to lay the basis for the subsequent patches of this set, which will use the
new data structure fields and routines.

There are no real algorithmic changes, rather an adaptation:

 (1) Replaced the static Ack Vector size (2) with a #define so that it can
     be adapted (with low loss / Ack Ratio, a value of 1 works, so 2 seems
     to be sufficient for the moment) and added a solution so that computing
     the ECN nonce will continue to work - even with larger Ack Vectors.

 (2) Replaced the #defines for Ack Vector states with a complete enum.

 (3) Replaced #defines to compute Ack Vector length and state with general
     purpose routines (inlines), and updated code to use these.

 (4) Added a `tail' field (conversion to circular buffer in subsequent patch).

 (5) Updated the (outdated) documentation for Ack Vector struct.

 (6) All sequence number containers now trimmed to 48 bits.

 (7) Removal of unused bits:
     * removed dccpav_ack_nonce from struct dccp_ackvec, since this is already
       redundantly stored in the `dccpavr_ack_nonce' (of Ack Vector record);
     * removed Elapsed Time for Ack Vectors (it was nowhere used);
     * replaced semantics of dccpavr_sent_len with dccpavr_ack_runlen, since
       the code needs to be able to remember the old run length;
     * reduced the de-/allocation routines (redundant / duplicate tests).


Justification for removing Elapsed Time information [can be removed]:
---------------------------------------------------------------------
 1. The Elapsed Time information for Ack Vectors was nowhere used in the code.
 2. DCCP does not implement rate-based pacing of acknowledgments. The only
    recommendation for always including Elapsed Time is in section 11.3 of
    RFC 4340: "Receivers that rate-pace acknowledgements SHOULD [...]
    include Elapsed Time options". But such is not the case here.
 3. It does not really improve estimation accuracy. The Elapsed Time field only
    records the time between the arrival of the last acknowledgeable packet and
    the time the Ack Vector is sent out. Since Linux does not (yet) implement
    delayed Acks, the time difference will typically be small, since often the
    arrival of a data packet triggers sending feedback at the HC-receiver.


Justification for changes in de-/allocation routines [can be removed]:
----------------------------------------------------------------------
  * INIT_LIST_HEAD in dccp_ackvec_record_new was redundant, since the list
    pointers were later overwritten when the node was added via list_add();
  * dccp_ackvec_record_new() was called in a single place only;
  * calls to list_del_init() before calling dccp_ackvec_record_delete() were
    redundant, since subsequently the entire element was k-freed;
  * since all calls to dccp_ackvec_record_delete() were preceded to a call to
    list_del_init(), the WARN_ON test would never evaluate to true;
  * since all calls to dccp_ackvec_record_delete() were made from within
    list_for_each_entry_safe(), the test for avr == NULL was redundant;
  * list_empty() in ackvec_free was redundant, since the same condition is
    embedded in the loop condition of the subsequent list_for_each_entry_safe().

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c      | 178 ++++++++++++++++++-------------------------------
 net/dccp/ackvec.h      | 103 +++++++++++++++-------------
 net/dccp/ccids/ccid2.c |  13 ++--
 net/dccp/input.c       |   6 +-
 4 files changed, 127 insertions(+), 173 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 01e4d39fa23..85ad70cfba5 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,7 +1,8 @@
 /*
  *  net/dccp/ackvec.c
  *
- *  An implementation of the DCCP protocol
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  *
  *      This program is free software; you can redistribute it and/or modify it
@@ -23,24 +24,32 @@
 static struct kmem_cache *dccp_ackvec_slab;
 static struct kmem_cache *dccp_ackvec_record_slab;
 
-static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
 {
-	struct dccp_ackvec_record *avr =
-			kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
+
+	if (av != NULL) {
+		av->av_buf_head	= DCCPAV_MAX_ACKVEC_LEN - 1;
+		INIT_LIST_HEAD(&av->av_records);
+	}
+	return av;
+}
 
-	if (avr != NULL)
-		INIT_LIST_HEAD(&avr->avr_node);
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
+{
+	struct dccp_ackvec_record *cur, *next;
 
-	return avr;
+	list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+		kmem_cache_free(dccp_ackvec_record_slab, cur);
+	INIT_LIST_HEAD(&av->av_records);
 }
 
-static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
+void dccp_ackvec_free(struct dccp_ackvec *av)
 {
-	if (unlikely(avr == NULL))
-		return;
-	/* Check if deleting a linked record */
-	WARN_ON(!list_empty(&avr->avr_node));
-	kmem_cache_free(dccp_ackvec_record_slab, avr);
+	if (likely(av != NULL)) {
+		dccp_ackvec_purge_records(av);
+		kmem_cache_free(dccp_ackvec_slab, av);
+	}
 }
 
 static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
@@ -68,24 +77,16 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
 	/* Figure out how many options do we need to represent the ackvec */
 	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts, i;
-	u32 elapsed_time;
+	u16 len = av->av_vec_len + 2 * nr_opts;
+	u8 i, nonce = 0;
 	const unsigned char *tail, *from;
 	unsigned char *to;
 	struct dccp_ackvec_record *avr;
-	suseconds_t delta;
 
 	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
 		return -1;
 
-	delta = ktime_us_delta(ktime_get_real(), av->av_time);
-	elapsed_time = delta / 10;
-
-	if (elapsed_time != 0 &&
-	    dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
-		return -1;
-
-	avr = dccp_ackvec_record_new();
+	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
 	if (avr == NULL)
 		return -1;
 
@@ -94,7 +95,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	to   = skb_push(skb, len);
 	len  = av->av_vec_len;
 	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
+	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
 
 	for (i = 0; i < nr_opts; ++i) {
 		int copylen = len;
@@ -102,7 +103,13 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 		if (len > DCCP_SINGLE_OPT_MAXLEN)
 			copylen = DCCP_SINGLE_OPT_MAXLEN;
 
-		*to++ = DCCPO_ACK_VECTOR_0;
+		/*
+		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+		 * its type; ack_nonce is the sum of all individual buf_nonce's.
+		 */
+		nonce ^= av->av_buf_nonce[i];
+
+		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
 		*to++ = copylen + 2;
 
 		/* Check if buf_head wraps */
@@ -123,75 +130,24 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/*
-	 *	From RFC 4340, A.2:
-	 *
-	 *	For each acknowledgement it sends, the HC-Receiver will add an
-	 *	acknowledgement record.  ack_seqno will equal the HC-Receiver
-	 *	sequence number it used for the ack packet; ack_ptr will equal
-	 *	buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
-	 *	equal buf_nonce.
+	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
 	 */
-	avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-	avr->avr_ack_ptr   = av->av_buf_head;
-	avr->avr_ack_ackno = av->av_buf_ackno;
-	avr->avr_ack_nonce = av->av_buf_nonce;
-	avr->avr_sent_len  = av->av_vec_len;
+	avr->avr_ack_seqno  = DCCP_SKB_CB(skb)->dccpd_seq;
+	avr->avr_ack_ptr    = av->av_buf_head;
+	avr->avr_ack_ackno  = av->av_buf_ackno;
+	avr->avr_ack_nonce  = nonce;
+	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
 
 	dccp_ackvec_insert_avr(av, avr);
 
 	dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
 		      "ack_ackno=%llu\n",
-		      dccp_role(sk), avr->avr_sent_len,
+		      dccp_role(sk), avr->avr_ack_runlen,
 		      (unsigned long long)avr->avr_ack_seqno,
 		      (unsigned long long)avr->avr_ack_ackno);
 	return 0;
 }
 
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
-	struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
-
-	if (av != NULL) {
-		av->av_buf_head	 = DCCP_MAX_ACKVEC_LEN - 1;
-		av->av_buf_ackno = UINT48_MAX + 1;
-		av->av_buf_nonce = 0;
-		av->av_time	 = ktime_set(0, 0);
-		av->av_vec_len	 = 0;
-		INIT_LIST_HEAD(&av->av_records);
-	}
-
-	return av;
-}
-
-void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-	if (unlikely(av == NULL))
-		return;
-
-	if (!list_empty(&av->av_records)) {
-		struct dccp_ackvec_record *avr, *next;
-
-		list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
-			list_del_init(&avr->avr_node);
-			dccp_ackvec_record_delete(avr);
-		}
-	}
-
-	kmem_cache_free(dccp_ackvec_slab, av);
-}
-
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
-				   const u32 index)
-{
-	return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
-}
-
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
-				 const u32 index)
-{
-	return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
-}
-
 /*
  * If several packets are missing, the HC-Receiver may prefer to enter multiple
  * bytes with run length 0, rather than a single byte with a larger run length;
@@ -204,7 +160,7 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 	unsigned int gap;
 	long new_head;
 
-	if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
+	if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN)
 		return -ENOBUFS;
 
 	gap	 = packets - 1;
@@ -212,18 +168,18 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 
 	if (new_head < 0) {
 		if (gap > 0) {
-			memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
+			memset(av->av_buf, DCCPAV_NOT_RECEIVED,
 			       gap + new_head + 1);
 			gap = -new_head;
 		}
-		new_head += DCCP_MAX_ACKVEC_LEN;
+		new_head += DCCPAV_MAX_ACKVEC_LEN;
 	}
 
 	av->av_buf_head = new_head;
 
 	if (gap > 0)
 		memset(av->av_buf + av->av_buf_head + 1,
-		       DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+		       DCCPAV_NOT_RECEIVED, gap);
 
 	av->av_buf[av->av_buf_head] = state;
 	av->av_vec_len += packets;
@@ -236,6 +192,8 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		    const u64 ackno, const u8 state)
 {
+	u8 *cur_head = av->av_buf + av->av_buf_head,
+	   *buf_end  = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
 	/*
 	 * Check at the right places if the buffer is full, if it is, tell the
 	 * caller to start dropping packets till the HC-Sender acks our ACK
@@ -260,7 +218,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 
 	/* See if this is the first ackno being inserted */
 	if (av->av_vec_len == 0) {
-		av->av_buf[av->av_buf_head] = state;
+		*cur_head = state;
 		av->av_vec_len = 1;
 	} else if (after48(ackno, av->av_buf_ackno)) {
 		const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
@@ -269,10 +227,9 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		 * Look if the state of this packet is the same as the
 		 * previous ackno and if so if we can bump the head len.
 		 */
-		if (delta == 1 &&
-		    dccp_ackvec_state(av, av->av_buf_head) == state &&
-		    dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
-			av->av_buf[av->av_buf_head]++;
+		if (delta == 1 && dccp_ackvec_state(cur_head) == state &&
+		    dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN)
+			*cur_head += 1;
 		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
 			return -ENOBUFS;
 	} else {
@@ -285,21 +242,17 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		 *	could reduce the complexity of this scan.)
 		 */
 		u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
-		u32 index = av->av_buf_head;
 
 		while (1) {
-			const u8 len = dccp_ackvec_len(av, index);
-			const u8 av_state = dccp_ackvec_state(av, index);
+			const u8 len = dccp_ackvec_runlen(cur_head);
 			/*
 			 * valid packets not yet in av_buf have a reserved
 			 * entry, with a len equal to 0.
 			 */
-			if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
-			    len == 0 && delta == 0) { /* Found our
-							 reserved seat! */
+			if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) {
 				dccp_pr_debug("Found %llu reserved seat!\n",
 					      (unsigned long long)ackno);
-				av->av_buf[index] = state;
+				*cur_head = state;
 				goto out;
 			}
 			/* len == 0 means one packet */
@@ -307,13 +260,12 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 				goto out_duplicate;
 
 			delta -= len + 1;
-			if (++index == DCCP_MAX_ACKVEC_LEN)
-				index = 0;
+			if (++cur_head == buf_end)
+				cur_head = av->av_buf;
 		}
 	}
 
 	av->av_buf_ackno = ackno;
-	av->av_time = ktime_get_real();
 out:
 	return 0;
 
@@ -333,13 +285,13 @@ static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
 	if (av->av_buf_head <= avr->avr_ack_ptr)
 		av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
 	else
-		av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
+		av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 -
 				 av->av_buf_head + avr->avr_ack_ptr;
 
 	/* free records */
 	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del_init(&avr->avr_node);
-		dccp_ackvec_record_delete(avr);
+		list_del(&avr->avr_node);
+		kmem_cache_free(dccp_ackvec_record_slab, avr);
 	}
 }
 
@@ -357,7 +309,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
 		if (ackno == avr->avr_ack_seqno) {
 			dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
 				      "ack_ackno=%llu, ACKED!\n",
-				      dccp_role(sk), 1,
+				      dccp_role(sk), avr->avr_ack_runlen,
 				      (unsigned long long)avr->avr_ack_seqno,
 				      (unsigned long long)avr->avr_ack_ackno);
 			dccp_ackvec_throw_record(av, avr);
@@ -387,7 +339,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
 	 */
 	avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
 	while (i--) {
-		const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+		const u8 rl = dccp_ackvec_runlen(vector);
 		u64 ackno_end_rl;
 
 		dccp_set_seqno(&ackno_end_rl, *ackno - rl);
@@ -404,8 +356,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
 		break;
 found:
 		if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
-			const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
-			if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
+			if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) {
 				dccp_pr_debug("%s ACK vector 0, len=%d, "
 					      "ack_seqno=%llu, ack_ackno=%llu, "
 					      "ACKED!\n",
@@ -448,10 +399,9 @@ int __init dccp_ackvec_init(void)
 	if (dccp_ackvec_slab == NULL)
 		goto out_err;
 
-	dccp_ackvec_record_slab =
-			kmem_cache_create("dccp_ackvec_record",
-					  sizeof(struct dccp_ackvec_record),
-					  0, SLAB_HWCACHE_ALIGN, NULL);
+	dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+					     sizeof(struct dccp_ackvec_record),
+					     0, SLAB_HWCACHE_ALIGN, NULL);
 	if (dccp_ackvec_record_slab == NULL)
 		goto out_destroy_slab;
 
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 1c10814fdf7..df18f9030db 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,9 +3,9 @@
 /*
  *  net/dccp/ackvec.h
  *
- *  An implementation of the DCCP protocol
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
  *	This program is free software; you can redistribute it and/or modify it
  *	under the terms of the GNU General Public License version 2 as
  *	published by the Free Software Foundation.
@@ -13,75 +13,84 @@
 
 #include <linux/dccp.h>
 #include <linux/compiler.h>
-#include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/types.h>
 
-/* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ */
+#define DCCPAV_NUM_ACKVECS	2
+#define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
 
 /* Estimated minimum average Ack Vector length - used for updating MPS */
 #define DCCPAV_MIN_OPTLEN	16
 
-#define DCCP_ACKVEC_STATE_RECEIVED	0
-#define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED	(3 << 6)
+enum dccp_ackvec_states {
+	DCCPAV_RECEIVED =	0x00,
+	DCCPAV_ECN_MARKED =	0x40,
+	DCCPAV_RESERVED =	0x80,
+	DCCPAV_NOT_RECEIVED =	0xC0
+};
+#define DCCPAV_MAX_RUNLEN	0x3F
 
-#define DCCP_ACKVEC_STATE_MASK		0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK		0x3F /* 00111111 */
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+	return *cell & DCCPAV_MAX_RUNLEN;
+}
 
-/** struct dccp_ackvec - ack vector
- *
- * This data structure is the one defined in RFC 4340, Appendix A.
- *
- * @av_buf_head - circular buffer head
- * @av_buf_tail - circular buffer tail
- * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
- *		       buffer (i.e. %av_buf_head)
- * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- * 		       by the buffer with State 0
- *
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+	return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/** struct dccp_ackvec - Ack Vector main data structure
  *
- * @av_records - list of dccp_ackvec_record
- * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
  *
- * @av_time - the time in usecs
- * @av_buf - circular buffer of acknowledgeable packets
+ * @av_buf:	   circular buffer storage area
+ * @av_buf_head:   head index; begin of live portion in @av_buf
+ * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
+ *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
+ * @av_veclen:	   length of the live portion of @av_buf
  */
 struct dccp_ackvec {
-	u64			av_buf_ackno;
-	struct list_head	av_records;
-	ktime_t			av_time;
+	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
 	u16			av_buf_head;
+	u16			av_buf_tail;
+	u64			av_buf_ackno:48;
+	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
+	struct list_head	av_records;
 	u16			av_vec_len;
-	u8			av_buf_nonce;
-	u8			av_ack_nonce;
-	u8			av_buf[DCCP_MAX_ACKVEC_LEN];
 };
 
-/** struct dccp_ackvec_record - ack vector record
+/** struct dccp_ackvec_record - Records information about sent Ack Vectors
  *
- * ACK vector record as defined in Appendix A of spec.
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
  *
- * The list is sorted by avr_ack_seqno
+ * @avr_node:	    the list node in @av_records
+ * @avr_ack_seqno:  sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno:  the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr:    pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce:  the sum of @av_buf_nonce's at the time this record was sent
  *
- * @avr_node - node in av_records
- * @avr_ack_seqno - sequence number of the packet this record was sent on
- * @avr_ack_ackno - sequence number being acknowledged
- * @avr_ack_ptr - pointer into av_buf where this record starts
- * @avr_ack_nonce - av_ack_nonce at the time this record was sent
- * @avr_sent_len - lenght of the record in av_buf
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
  */
 struct dccp_ackvec_record {
 	struct list_head avr_node;
-	u64		 avr_ack_seqno;
-	u64		 avr_ack_ackno;
+	u64		 avr_ack_seqno:48;
+	u64		 avr_ack_ackno:48;
 	u16		 avr_ack_ptr;
-	u16		 avr_sent_len;
-	u8		 avr_ack_nonce;
+	u8		 avr_ack_runlen;
+	u8		 avr_ack_nonce:1;
 };
 
 struct sock;
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index f56ab68a4b7..813d5cd40e8 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -580,8 +580,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 					 &vector, &veclen)) != -1) {
 		/* go through this ack vector */
 		while (veclen--) {
-			const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-			u64 ackno_end_rl = SUB48(ackno, rl);
+			u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector));
 
 			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
 				       (unsigned long long)ackno,
@@ -604,17 +603,15 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = *vector &
-						 DCCP_ACKVEC_STATE_MASK;
+				const u8 state = dccp_ackvec_state(vector);
 
 				/* new packet received or marked */
-				if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+				if (state != DCCPAV_NOT_RECEIVED &&
 				    !seqp->ccid2s_acked) {
-					if (state ==
-					    DCCP_ACKVEC_STATE_ECN_MARKED) {
+					if (state == DCCPAV_ECN_MARKED)
 						ccid2_congestion_event(sk,
 								       seqp);
-					} else
+					else
 						ccid2_new_ack(sk, seqp,
 							      &maxincr);
 
diff --git a/net/dccp/input.c b/net/dccp/input.c
index e3f43d55e3c..70ad0ba7214 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -377,8 +377,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
 	if (dp->dccps_hc_rx_ackvec != NULL &&
 	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-			    DCCP_SKB_CB(skb)->dccpd_seq,
-			    DCCP_ACKVEC_STATE_RECEIVED))
+			    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
 		goto discard;
 	dccp_deliver_input_to_ccids(sk, skb);
 
@@ -627,8 +626,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 		if (dp->dccps_hc_rx_ackvec != NULL &&
 		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-				    DCCP_SKB_CB(skb)->dccpd_seq,
-				    DCCP_ACKVEC_STATE_RECEIVED))
+				    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
 			goto discard;
 
 		dccp_deliver_input_to_ccids(sk, skb);
-- 
cgit v1.2.3


From 4829007c7bc689cbc290fc09eccbe90bd52c2a5e Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Separate internals of Ack Vectors from option-parsing
 code

This patch
 * separates Ack Vector housekeeping code from option-insertion code;
 * shifts option-specific code from ackvec.c into options.c;
 * introduces a dedicated routine to take care of the Ack Vector records;
 * simplifies the dccp_ackvec_insert_avr() routine: the BUG_ON was redundant,
   since the list is automatically arranged in descending order of ack_seqno.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c  | 100 ++++++++++-------------------------------------------
 net/dccp/ackvec.h  |   5 ++-
 net/dccp/options.c |  60 ++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 85 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 85ad70cfba5..de84dd34c93 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -52,99 +52,35 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
 	}
 }
 
-static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
-				   struct dccp_ackvec_record *avr)
-{
-	/*
-	 * AVRs are sorted by seqno. Since we are sending them in order, we
-	 * just add the AVR at the head of the list.
-	 * -sorbo.
-	 */
-	if (!list_empty(&av->av_records)) {
-		const struct dccp_ackvec_record *head =
-					list_entry(av->av_records.next,
-						   struct dccp_ackvec_record,
-						   avr_node);
-		BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
-	}
-
-	list_add(&avr->avr_node, &av->av_records);
-}
-
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_ackvec_update_records  -  Record information about sent Ack Vectors
+ * @av:		Ack Vector records to update
+ * @seqno:	Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum:	The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
-	/* Figure out how many options do we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts;
-	u8 i, nonce = 0;
-	const unsigned char *tail, *from;
-	unsigned char *to;
 	struct dccp_ackvec_record *avr;
 
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
-		return -1;
-
 	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
 	if (avr == NULL)
-		return -1;
-
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
-	to   = skb_push(skb, len);
-	len  = av->av_vec_len;
-	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-
-	for (i = 0; i < nr_opts; ++i) {
-		int copylen = len;
-
-		if (len > DCCP_SINGLE_OPT_MAXLEN)
-			copylen = DCCP_SINGLE_OPT_MAXLEN;
-
-		/*
-		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
-		 * its type; ack_nonce is the sum of all individual buf_nonce's.
-		 */
-		nonce ^= av->av_buf_nonce[i];
-
-		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
-		*to++ = copylen + 2;
-
-		/* Check if buf_head wraps */
-		if (from + copylen > tail) {
-			const u16 tailsize = tail - from;
-
-			memcpy(to, from, tailsize);
-			to	+= tailsize;
-			len	-= tailsize;
-			copylen	-= tailsize;
-			from	= av->av_buf;
-		}
-
-		memcpy(to, from, copylen);
-		from += copylen;
-		to   += copylen;
-		len  -= copylen;
-	}
+		return -ENOBUFS;
 
-	/*
-	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
-	 */
-	avr->avr_ack_seqno  = DCCP_SKB_CB(skb)->dccpd_seq;
+	avr->avr_ack_seqno  = seqno;
 	avr->avr_ack_ptr    = av->av_buf_head;
 	avr->avr_ack_ackno  = av->av_buf_ackno;
-	avr->avr_ack_nonce  = nonce;
+	avr->avr_ack_nonce  = nonce_sum;
 	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
+	/*
+	 * Since GSS is incremented for each packet, the list is automatically
+	 * arranged in descending order of @ack_seqno.
+	 */
+	list_add(&avr->avr_node, &av->av_records);
 
-	dccp_ackvec_insert_avr(av, avr);
-
-	dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
-		      "ack_ackno=%llu\n",
-		      dccp_role(sk), avr->avr_ack_runlen,
+	dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
 		      (unsigned long long)avr->avr_ack_seqno,
-		      (unsigned long long)avr->avr_ack_ackno);
+		      (unsigned long long)avr->avr_ack_ackno,
+		      avr->avr_ack_runlen);
 	return 0;
 }
 
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index df18f9030db..b34e5ed4c34 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -112,7 +112,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 			     u64 *ackno, const u8 opt,
 			     const u8 *value, const u8 len);
 
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
+extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
 
 static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
 {
@@ -155,8 +155,7 @@ static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 	return -1;
 }
 
-static inline int dccp_insert_option_ackvec(const struct sock *sk,
-					    const struct sk_buff *skb)
+static inline int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 nonce)
 {
 	return -1;
 }
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 9fe0510f402..392d7db3134 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -428,6 +428,66 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
 	return 0;
 }
 
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	/* Figure out how many options do we need to represent the ackvec */
+	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
+	u16 len = av->av_vec_len + 2 * nr_opts;
+	u8 i, nonce = 0;
+	const unsigned char *tail, *from;
+	unsigned char *to;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+	to   = skb_push(skb, len);
+	len  = av->av_vec_len;
+	from = av->av_buf + av->av_buf_head;
+	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+
+	for (i = 0; i < nr_opts; ++i) {
+		int copylen = len;
+
+		if (len > DCCP_SINGLE_OPT_MAXLEN)
+			copylen = DCCP_SINGLE_OPT_MAXLEN;
+
+		/*
+		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+		 * its type; ack_nonce is the sum of all individual buf_nonce's.
+		 */
+		nonce ^= av->av_buf_nonce[i];
+
+		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+		*to++ = copylen + 2;
+
+		/* Check if buf_head wraps */
+		if (from + copylen > tail) {
+			const u16 tailsize = tail - from;
+
+			memcpy(to, from, tailsize);
+			to	+= tailsize;
+			len	-= tailsize;
+			copylen	-= tailsize;
+			from	= av->av_buf;
+		}
+
+		memcpy(to, from, copylen);
+		from += copylen;
+		to   += copylen;
+		len  -= copylen;
+	}
+	/*
+	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+	 */
+	if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce))
+		return -ENOBUFS;
+	return 0;
+}
+
 /**
  * dccp_insert_option_mandatory  -  Mandatory option (5.8.2)
  * Note that since we are using skb_push, this function needs to be called
-- 
cgit v1.2.3


From d7dc7e5f49299739e610ea8febf9ea91a4dc1ae9 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Implementation of circular Ack Vector buffer with
 overflow handling

This completes the implementation of a circular buffer for Ack Vectors, by
extending the current (linear array-based) implementation.  The changes are:

 (a) An `overflow' flag to deal with the case of overflow. As before, dynamic
     growth of the buffer will not be supported; but code will be added to deal
     robustly with overflowing Ack Vector buffers.

 (b) A `tail_seqno' field. When naively implementing the algorithm of Appendix A
     in RFC 4340, problems arise whenever subsequent Ack Vector records overlap,
     which can bring the entire run length calculation completely out of synch.
     (This is documented on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/\
                                             ack_vectors/tracking_tail_ackno/ .)
 (c) The buffer lengthi is now computed dynamically (i.e. current fill level),
     as the span between head to tail.

As a result, dccp_ackvec_pending() is now simpler - the #ifdef is no longer
necessary since buf_empty is always true when IP_DCCP_ACKVEC is not configured.

Note on overflow handling:
-------------------------
 The Ack Vector code previously simply started to drop packets when the
 Ack Vector buffer overflowed. This means that the userspace application
 will not be able to receive, only because of an Ack Vector storage problem.

 Furthermore, overflow may be transient, so that applications may later
 recover from the overflow. Recovering from dropped packets is more difficult
 (e.g. video key frames).

 Hence the patch uses a different policy: when the buffer overflows, the oldest
 entries are subsequently overwritten. This has a higher chance of recovery.
 Details are on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ack_vectors/

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c  | 31 ++++++++++++++++++++++++++++++-
 net/dccp/ackvec.h  | 17 ++++++++++++++---
 net/dccp/dccp.h    | 14 +++++++-------
 net/dccp/options.c | 10 +++++-----
 4 files changed, 56 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index de84dd34c93..1184d5e5dc9 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -29,7 +29,7 @@ struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
 	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
 
 	if (av != NULL) {
-		av->av_buf_head	= DCCPAV_MAX_ACKVEC_LEN - 1;
+		av->av_buf_head	= av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
 		INIT_LIST_HEAD(&av->av_records);
 	}
 	return av;
@@ -71,6 +71,14 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 	avr->avr_ack_ackno  = av->av_buf_ackno;
 	avr->avr_ack_nonce  = nonce_sum;
 	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
+	/*
+	 * When the buffer overflows, we keep no more than one record. This is
+	 * the simplest way of disambiguating sender-Acks dating from before the
+	 * overflow from sender-Acks which refer to after the overflow; a simple
+	 * solution is preferable here since we are handling an exception.
+	 */
+	if (av->av_overflow)
+		dccp_ackvec_purge_records(av);
 	/*
 	 * Since GSS is incremented for each packet, the list is automatically
 	 * arranged in descending order of @ack_seqno.
@@ -84,6 +92,27 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 	return 0;
 }
 
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
+{
+	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
+}
+
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
+{
+	return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
+}
+
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
+{
+	if (unlikely(av->av_overflow))
+		return DCCPAV_MAX_ACKVEC_LEN;
+	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
+}
+
 /*
  * If several packets are missing, the HC-Receiver may prefer to enter multiple
  * bytes with run length 0, rather than a single byte with a larger run length;
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index b34e5ed4c34..92f65b0fef5 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -21,6 +21,7 @@
  * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
  * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
  * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
  */
 #define DCCPAV_NUM_ACKVECS	2
 #define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
@@ -55,8 +56,10 @@ static inline u8 dccp_ackvec_state(const u8 *cell)
  * @av_buf_head:   head index; begin of live portion in @av_buf
  * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
  * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest  seqno of acknowledgeable packet recorded in @av_buf
  * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
  *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
  * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
  * @av_veclen:	   length of the live portion of @av_buf
  */
@@ -65,7 +68,9 @@ struct dccp_ackvec {
 	u16			av_buf_head;
 	u16			av_buf_tail;
 	u64			av_buf_ackno:48;
+	u64			av_tail_ackno:48;
 	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
+	u8			av_overflow:1;
 	struct list_head	av_records;
 	u16			av_vec_len;
 };
@@ -113,10 +118,11 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 			     const u8 *value, const u8 len);
 
 extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
 
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
 {
-	return av->av_vec_len;
+	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
 }
 #else /* CONFIG_IP_DCCP_ACKVEC */
 static inline int dccp_ackvec_init(void)
@@ -160,9 +166,14 @@ static inline int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8
 	return -1;
 }
 
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+static inline u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
 {
 	return 0;
 }
+
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
+{
+	return true;
+}
 #endif /* CONFIG_IP_DCCP_ACKVEC */
 #endif /* _ACKVEC_H */
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index e4d6e76ced4..1e65378eea3 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -448,15 +448,15 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
 	dp->dccps_awh = dp->dccps_gss;
 }
 
+static inline int dccp_ackvec_pending(const struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
+	       !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
+}
+
 static inline int dccp_ack_pending(const struct sock *sk)
 {
-	const struct dccp_sock *dp = dccp_sk(sk);
-	return
-#ifdef CONFIG_IP_DCCP_ACKVEC
-	       (dp->dccps_hc_rx_ackvec != NULL &&
-		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
-#endif
-	       inet_csk_ack_scheduled(sk);
+	return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
 }
 
 extern int  dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 392d7db3134..3163ae980f1 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -432,9 +432,10 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	const u16 buflen = dccp_ackvec_buflen(av);
 	/* Figure out how many options do we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts;
+	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+	u16 len = buflen + 2 * nr_opts;
 	u8 i, nonce = 0;
 	const unsigned char *tail, *from;
 	unsigned char *to;
@@ -445,7 +446,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
 
 	to   = skb_push(skb, len);
-	len  = av->av_vec_len;
+	len  = buflen;
 	from = av->av_buf + av->av_buf_head;
 	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
 
@@ -583,8 +584,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 			if (dccp_insert_option_timestamp(sk, skb))
 				return -1;
 
-		} else if (dp->dccps_hc_rx_ackvec != NULL &&
-			   dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
+		} else if (dccp_ackvec_pending(sk) &&
 			   dccp_insert_option_ackvec(sk, skb)) {
 				return -1;
 		}
-- 
cgit v1.2.3


From 68b1de15765f2b0e0925e692dab2b2fa2abd93fc Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Algorithm to update buffer state

This provides a routine to consistently update the buffer state when the
peer acknowledges receipt of Ack Vectors; updating state in the list of Ack
Vectors as well as in the circular buffer.

While based on RFC 4340, several additional (and necessary) precautions were
added to protect the consistency of the buffer state. These additions are
essential, since analysis and experience showed that the basic algorithm was
insufficient for this task (which lead to problems that were hard to debug).

The algorithm now
 * deals with HC-sender acknowledging to HC-receiver and vice versa,
 * keeps track of the last unacknowledged but received seqno in tail_ackno,
 * has special cases to reset the overflow condition when appropriate,
 * is protected against receiving older information (would mess up buffer state).

Note: The older code performed an unnecessary step, where the sender cleared
Ack Vector state by parsing the Ack Vector received by the HC-receiver. Doing
this was entirely redundant, since
 * the receiver always puts the full acknowledgment window (groups 2,3 in 11.4.2)
   into the Ack Vectors it sends; hence the HC-receiver is only interested in the
   highest state that the HC-sender received;
 * this means that the acknowledgment number on the (Data)Ack from the HC-sender
   is sufficient; and work done in parsing earlier state is not necessary, since
   the later state subsumes the  earlier one (see also RFC 4340, A.4).
This older interface (dccp_ackvec_parse()) is therefore removed.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c  | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/ackvec.h  |  6 ++++
 net/dccp/input.c   |  4 +--
 net/dccp/options.c |  6 ++--
 4 files changed, 98 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 1184d5e5dc9..f1341a617f9 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -92,6 +92,24 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 	return 0;
 }
 
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+						     const u64 ackno)
+{
+	struct dccp_ackvec_record *avr;
+	/*
+	 * Exploit that records are inserted in descending order of sequence
+	 * number, start with the oldest record first. If @ackno is `before'
+	 * the earliest ack_ackno, the packet is too old to be considered.
+	 */
+	list_for_each_entry_reverse(avr, av_list, avr_node) {
+		if (avr->avr_ack_seqno == ackno)
+			return avr;
+		if (before48(ackno, avr->avr_ack_seqno))
+			break;
+	}
+	return NULL;
+}
+
 /*
  * Buffer index and length computation using modulo-buffersize arithmetic.
  * Note that, as pointers move from right to left, head is `before' tail.
@@ -356,6 +374,76 @@ int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 	return 0;
 }
 
+/**
+ * dccp_ackvec_clear_state  -  Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
+ {
+	struct dccp_ackvec_record *avr, *next;
+	u8 runlen_now, eff_runlen;
+	s64 delta;
+
+	avr = dccp_ackvec_lookup(&av->av_records, ackno);
+	if (avr == NULL)
+		return;
+	/*
+	 * Deal with outdated acknowledgments: this arises when e.g. there are
+	 * several old records and the acks from the peer come in slowly. In
+	 * that case we may still have records that pre-date tail_ackno.
+	 */
+	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+	if (delta < 0)
+		goto free_records;
+	/*
+	 * Deal with overlapping Ack Vectors: don't subtract more than the
+	 * number of packets between tail_ackno and ack_ackno.
+	 */
+	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
+
+	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
+	/*
+	 * The run length of Ack Vector cells does not decrease over time. If
+	 * the run length is the same as at the time the Ack Vector was sent, we
+	 * free the ack_ptr cell. That cell can however not be freed if the run
+	 * length has increased: in this case we need to move the tail pointer
+	 * backwards (towards higher indices), to its next-oldest neighbour.
+	 */
+	if (runlen_now > eff_runlen) {
+
+		av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+		av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+
+		/* This move may not have cleared the overflow flag. */
+		if (av->av_overflow)
+			av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+	} else {
+		av->av_buf_tail	= avr->avr_ack_ptr;
+		/*
+		 * We have made sure that avr points to a valid cell within the
+		 * buffer. This cell is either older than head, or equals head
+		 * (empty buffer): in both cases we no longer have any overflow.
+		 */
+		av->av_overflow	= 0;
+	}
+
+	/*
+	 * The peer has acknowledged up to and including ack_ackno. Hence the
+	 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+	 */
+	av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+		list_del(&avr->avr_node);
+		kmem_cache_free(dccp_ackvec_record_slab, avr);
+	}
+}
+
 int __init dccp_ackvec_init(void)
 {
 	dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 92f65b0fef5..b757e9b4110 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -118,6 +118,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 			     const u8 *value, const u8 len);
 
 extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
 extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
 
 static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
@@ -149,6 +150,11 @@ static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 	return -1;
 }
 
+static inline void dccp_ackvec_clear_state(struct dccp_ackvec *av,
+					    const u64 ackno)
+{
+}
+
 static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
 					       struct sock *sk, const u64 ackno)
 {
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 70ad0ba7214..77a5d57ab70 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -164,8 +164,8 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	if (dp->dccps_hc_rx_ackvec != NULL)
-		dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
-					    DCCP_SKB_CB(skb)->dccpd_ack_seq);
+		dccp_ackvec_clear_state(dp->dccps_hc_rx_ackvec,
+					DCCP_SKB_CB(skb)->dccpd_ack_seq);
 }
 
 static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 3163ae980f1..b11d7b7167f 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	struct dccp_sock *dp = dccp_sk(sk);
 	const struct dccp_hdr *dh = dccp_hdr(skb);
 	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
-	u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
 	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
 	unsigned char *opt_ptr = options;
 	const unsigned char *opt_end = (unsigned char *)dh +
@@ -133,9 +132,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		case DCCPO_ACK_VECTOR_1:
 			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
 				break;
-			if (dp->dccps_hc_rx_ackvec != NULL &&
-			    dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
-				goto out_invalid_option;
+			dccp_pr_debug("%s Ack Vector (len=%u)\n", dccp_role(sk),
+				      len);
 			break;
 		case DCCPO_TIMESTAMP:
 			if (len != 4)
-- 
cgit v1.2.3


From e28fe59f9c82ef55fc9b55e745531c9fed86f00a Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Update code for the Ack Vector input/registration
 routine

This patch uupdates the code which registers new packets as received, using the
new circular buffer interface. It contributes a new algorithm which
	* supports both tail/head pointers and buffer wrap-around and
	* deals with overflow (head/tail move in lock-step).

The updated code is also partioned differently, into
	1. dealing with the empty buffer,
	2. adding new packets into non-empty buffer,
	3. reserving space when encountering a `hole' in the sequence space,
	4. updating old state and deciding when old state is irrelevant.

Protection against large burst losses: With regard to (3), it is too costly to
reserve space when there are large bursts of losses. When bursts get too large,
the code does no longer reserve space and just fills in cells normally. This
measure reduces space consumption by a factor of 63.

The code reuses in part the previous implementation by Arnaldo de Melo.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/ackvec.h |   9 ++++
 2 files changed, 159 insertions(+)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index f1341a617f9..bf9cb7d7549 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -131,6 +131,156 @@ u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
 	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
 }
 
+/**
+ * dccp_ackvec_update_old  -  Update previous state as per RFC 4340, 11.4.1
+ * @av:		non-empty buffer to update
+ * @distance:   negative or zero distance of @seqno from buf_ackno downward
+ * @seqno:	the (old) sequence number whose record is to be updated
+ * @state:	state in which packet carrying @seqno was received
+ */
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+				   u64 seqno, enum dccp_ackvec_states state)
+{
+	u16 ptr = av->av_buf_head;
+
+	BUG_ON(distance > 0);
+	if (unlikely(dccp_ackvec_is_empty(av)))
+		return;
+
+	do {
+		u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
+
+		if (distance + runlen >= 0) {
+			/*
+			 * Only update the state if packet has not been received
+			 * yet. This is OK as per the second table in RFC 4340,
+			 * 11.4.1; i.e. here we are using the following table:
+			 *                     RECEIVED
+			 *                      0   1   3
+			 *              S     +---+---+---+
+			 *              T   0 | 0 | 0 | 0 |
+			 *              O     +---+---+---+
+			 *              R   1 | 1 | 1 | 1 |
+			 *              E     +---+---+---+
+			 *              D   3 | 0 | 1 | 3 |
+			 *                    +---+---+---+
+			 * The "Not Received" state was set by reserve_seats().
+			 */
+			if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+				av->av_buf[ptr] = state;
+			else
+				dccp_pr_debug("Not changing %llu state to %u\n",
+					      (unsigned long long)seqno, state);
+			break;
+		}
+
+		distance += runlen + 1;
+		ptr	  = __ackvec_idx_add(ptr, 1);
+
+	} while (ptr != av->av_buf_tail);
+}
+
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+	    len	  = DCCPAV_MAX_ACKVEC_LEN - start;
+
+	/* check for buffer wrap-around */
+	if (num > len) {
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+		start = 0;
+		num  -= len;
+	}
+	if (num)
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
+}
+
+/**
+ * dccp_ackvec_add_new  -  Record one or more new entries in Ack Vector buffer
+ * @av:		 container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno:	 sequence number of the first packet in @num_packets
+ * @state:	 state in which packet carrying @seqno was received
+ */
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+				u64 seqno, enum dccp_ackvec_states state)
+{
+	u32 num_cells = num_packets;
+
+	if (num_packets > DCCPAV_BURST_THRESH) {
+		u32 lost_packets = num_packets - 1;
+
+		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
+		/*
+		 * We received 1 packet and have a loss of size "num_packets-1"
+		 * which we squeeze into num_cells-1 rather than reserving an
+		 * entire byte for each lost packet.
+		 * The reason is that the vector grows in O(burst_length); when
+		 * it grows too large there will no room left for the payload.
+		 * This is a trade-off: if a few packets out of the burst show
+		 * up later, their state will not be changed; it is simply too
+		 * costly to reshuffle/reallocate/copy the buffer each time.
+		 * Should such problems persist, we will need to switch to a
+		 * different underlying data structure.
+		 */
+		for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+			u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
+
+			av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+			av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+			lost_packets -= len;
+		}
+	}
+
+	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+		av->av_overflow = true;
+	}
+
+	av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+	if (av->av_overflow)
+		av->av_buf_tail = av->av_buf_head;
+
+	av->av_buf[av->av_buf_head] = state;
+	av->av_buf_ackno	    = seqno;
+
+	if (num_packets > 1)
+		dccp_ackvec_reserve_seats(av, num_packets - 1);
+}
+
+/**
+ * dccp_ackvec_input  -  Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
+{
+	u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	enum dccp_ackvec_states state = DCCPAV_RECEIVED;
+
+	if (dccp_ackvec_is_empty(av)) {
+		dccp_ackvec_add_new(av, 1, seqno, state);
+		av->av_tail_ackno = seqno;
+
+	} else {
+		s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+		u8 *current_head = av->av_buf + av->av_buf_head;
+
+		if (num_packets == 1 &&
+		    dccp_ackvec_state(current_head) == state &&
+		    dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
+
+			*current_head   += 1;
+			av->av_buf_ackno = seqno;
+
+		} else if (num_packets > 0) {
+			dccp_ackvec_add_new(av, num_packets, seqno, state);
+		} else {
+			dccp_ackvec_update_old(av, num_packets, seqno, state);
+		}
+	}
+}
+
 /*
  * If several packets are missing, the HC-Receiver may prefer to enter multiple
  * bytes with run length 0, rather than a single byte with a larger run length;
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index b757e9b4110..36ca2e9e5c8 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -29,6 +29,9 @@
 /* Estimated minimum average Ack Vector length - used for updating MPS */
 #define DCCPAV_MIN_OPTLEN	16
 
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH	(DCCPAV_MAX_ACKVEC_LEN / 8)
+
 enum dccp_ackvec_states {
 	DCCPAV_RECEIVED =	0x00,
 	DCCPAV_ECN_MARKED =	0x40,
@@ -117,6 +120,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 			     u64 *ackno, const u8 opt,
 			     const u8 *value, const u8 len);
 
+extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
 extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
 extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
 extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
@@ -144,6 +148,11 @@ static inline void dccp_ackvec_free(struct dccp_ackvec *av)
 {
 }
 
+static inline void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
+{
+
+}
+
 static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 				  const u64 ackno, const u8 state)
 {
-- 
cgit v1.2.3


From 283fb4a5f39d1521d53e1044bff0ba2654acf145 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Consolidate Ack-Vector processing within main DCCP
 module

This aggregates Ack Vector processing (handling input and clearing old state)
into one function, for the following reasons and benefits:
 * all Ack Vector-specific processing is now in one place;
 * duplicated code is removed;
 * ensuring sanity: from an Ack Vector point of view, it is better to clear the
                    old state first before entering new state;
 * Ack Event handling happens mostly within the CCIDs, not the main DCCP module.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/input.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 77a5d57ab70..9a108ce17fc 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -159,13 +159,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
 	dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
 }
 
-static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
 
-	if (dp->dccps_hc_rx_ackvec != NULL)
-		dccp_ackvec_clear_state(dp->dccps_hc_rx_ackvec,
-					DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	if (av == NULL)
+		return;
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	dccp_ackvec_input(av, skb);
 }
 
 static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -364,21 +366,13 @@ discard:
 int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			 const struct dccp_hdr *dh, const unsigned len)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-
 	if (dccp_check_seqno(sk, skb))
 		goto discard;
 
 	if (dccp_parse_options(sk, NULL, skb))
 		return 1;
 
-	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
-		dccp_event_ack_recv(sk, skb);
-
-	if (dp->dccps_hc_rx_ackvec != NULL &&
-	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-			    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
-		goto discard;
+	dccp_handle_ackvec_processing(sk, skb);
 	dccp_deliver_input_to_ccids(sk, skb);
 
 	return __dccp_rcv_established(sk, skb, dh, len);
@@ -621,14 +615,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (dccp_parse_options(sk, NULL, skb))
 			return 1;
 
-		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
-			dccp_event_ack_recv(sk, skb);
-
-		if (dp->dccps_hc_rx_ackvec != NULL &&
-		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-				    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
-			goto discard;
-
+		dccp_handle_ackvec_processing(sk, skb);
 		dccp_deliver_input_to_ccids(sk, skb);
 	}
 
-- 
cgit v1.2.3


From c2f42077bd06f300ae959204f3c007f820f5e769 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Schedule Sync as out-of-band mechanism

The problem with Ack Vectors is that

  i) their length is variable and can in principle grow quite large,
 ii) it is hard to predict exactly how large they will be.

Due to the second point it seems not a good idea to reduce the MPS; in
particular when on average there is enough room for the Ack Vector and an
increase in length is momentarily due to some burst loss, after which the
Ack Vector returns to its normal/average length.

The solution taken by this patch is to subtract a minimum-expected Ack Vector
length from the MPS (previous patch), and to defer any larger Ack Vectors onto
a separate Sync - but only if indeed there is no space left on the skb.

This patch provides the infrastructure to schedule Sync-packets for transporting
(urgent) out-of-band data. Its signalling is quicker than scheduling an Ack, since
it does not need to wait for new application data.

It can thus serve other parts of the DCCP code as well.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 24 ++++++++++++++++++++----
 net/dccp/output.c  |  8 ++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index b11d7b7167f..791e07853a7 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -430,6 +430,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
 	const u16 buflen = dccp_ackvec_buflen(av);
 	/* Figure out how many options do we need to represent the ackvec */
 	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
@@ -438,10 +439,25 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	const unsigned char *tail, *from;
 	unsigned char *to;
 
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+	if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+		DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+			  dccp_packet_name(dcb->dccpd_type));
 		return -1;
-
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+	}
+	/*
+	 * Since Ack Vectors are variable-length, we can not always predict
+	 * their size. To catch exception cases where the space is running out
+	 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+	 */
+	if (len > DCCPAV_MIN_OPTLEN &&
+	    len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+		DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+			  "MPS=%u ==> reduce payload size?\n", len, skb->len,
+			  dcb->dccpd_opt_len, dp->dccps_mss_cache);
+		dp->dccps_sync_scheduled = 1;
+		return 0;
+	}
+	dcb->dccpd_opt_len += len;
 
 	to   = skb_push(skb, len);
 	len  = buflen;
@@ -482,7 +498,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
 	 */
-	if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce))
+	if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
 		return -ENOBUFS;
 	return 0;
 }
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 1b316830758..bfda071559f 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -305,6 +305,8 @@ void dccp_write_xmit(struct sock *sk, int block)
 			if (err)
 				DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
 					 err);
+			if (dp->dccps_sync_scheduled)
+				dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
 		} else {
 			dccp_pr_debug("packet discarded due to err=%d\n", err);
 			kfree_skb(skb);
@@ -591,6 +593,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
 	DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
 	DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
 
+	/*
+	 * Clear the flag in case the Sync was scheduled for out-of-band data,
+	 * such as carrying a long Ack Vector.
+	 */
+	dccp_sk(sk)->dccps_sync_scheduled = 0;
+
 	dccp_transmit_skb(sk, skb);
 }
 
-- 
cgit v1.2.3


From 5a577b488f687f339dea62e7bb4f4c5793ad523f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Remove old infrastructure

This removes
 * functions for which updates have been provided in the preceding patches and
 * the @av_vec_len field - it is no longer necessary since the buffer length is
   now always computed dynamically;
 * conditional debugging code (CONFIG_IP_DCCP_ACKVEC).

The reason for removing the conditional debugging code is that Ack Vectors are
an almost inevitable necessity - RFC 4341 says that for CCID-2, Ack Vectors must
be used. Furthermore, the code would be only interesting for coding - after some
extensive testing with this patch set, having the debug code around is no longer
of real help.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/Kconfig       |   3 -
 net/dccp/Makefile      |   5 +-
 net/dccp/ackvec.c      | 251 -------------------------------------------------
 net/dccp/ackvec.h      |  79 +---------------
 net/dccp/ccids/Kconfig |   1 -
 5 files changed, 3 insertions(+), 336 deletions(-)

(limited to 'net')

diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 7aa2a7acc7e..206c16ad9c3 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -25,9 +25,6 @@ config INET_DCCP_DIAG
 	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
 	def_tristate m
 
-config IP_DCCP_ACKVEC
-	bool
-
 source "net/dccp/ccids/Kconfig"
 
 menu "DCCP Kernel Hacking"
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index f4f8793aaff..b68440bd7fa 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,6 +1,7 @@
 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
 
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
+dccp-y := ccid.o feat.o input.o minisocks.o options.o \
+	  output.o proto.o timer.o ackvec.o
 
 dccp_ipv4-y := ipv4.o
 
@@ -8,8 +9,6 @@ dccp_ipv4-y := ipv4.o
 obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
 dccp_ipv6-y := ipv6.o
 
-dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
-
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
 obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
 
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index bf9cb7d7549..66b8a51300c 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -9,18 +9,10 @@
  *      under the terms of the GNU General Public License as published by the
  *      Free Software Foundation; version 2 of the License;
  */
-
-#include "ackvec.h"
 #include "dccp.h"
-
-#include <linux/init.h>
-#include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/skbuff.h>
 #include <linux/slab.h>
 
-#include <net/sock.h>
-
 static struct kmem_cache *dccp_ackvec_slab;
 static struct kmem_cache *dccp_ackvec_record_slab;
 
@@ -281,249 +273,6 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
 	}
 }
 
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
- */
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
-						 const unsigned int packets,
-						 const unsigned char state)
-{
-	unsigned int gap;
-	long new_head;
-
-	if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN)
-		return -ENOBUFS;
-
-	gap	 = packets - 1;
-	new_head = av->av_buf_head - packets;
-
-	if (new_head < 0) {
-		if (gap > 0) {
-			memset(av->av_buf, DCCPAV_NOT_RECEIVED,
-			       gap + new_head + 1);
-			gap = -new_head;
-		}
-		new_head += DCCPAV_MAX_ACKVEC_LEN;
-	}
-
-	av->av_buf_head = new_head;
-
-	if (gap > 0)
-		memset(av->av_buf + av->av_buf_head + 1,
-		       DCCPAV_NOT_RECEIVED, gap);
-
-	av->av_buf[av->av_buf_head] = state;
-	av->av_vec_len += packets;
-	return 0;
-}
-
-/*
- * Implements the RFC 4340, Appendix A
- */
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-		    const u64 ackno, const u8 state)
-{
-	u8 *cur_head = av->av_buf + av->av_buf_head,
-	   *buf_end  = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-	/*
-	 * Check at the right places if the buffer is full, if it is, tell the
-	 * caller to start dropping packets till the HC-Sender acks our ACK
-	 * vectors, when we will free up space in av_buf.
-	 *
-	 * We may well decide to do buffer compression, etc, but for now lets
-	 * just drop.
-	 *
-	 * From Appendix A.1.1 (`New Packets'):
-	 *
-	 *	Of course, the circular buffer may overflow, either when the
-	 *	HC-Sender is sending data at a very high rate, when the
-	 *	HC-Receiver's acknowledgements are not reaching the HC-Sender,
-	 *	or when the HC-Sender is forgetting to acknowledge those acks
-	 *	(so the HC-Receiver is unable to clean up old state). In this
-	 *	case, the HC-Receiver should either compress the buffer (by
-	 *	increasing run lengths when possible), transfer its state to
-	 *	a larger buffer, or, as a last resort, drop all received
-	 *	packets, without processing them whatsoever, until its buffer
-	 *	shrinks again.
-	 */
-
-	/* See if this is the first ackno being inserted */
-	if (av->av_vec_len == 0) {
-		*cur_head = state;
-		av->av_vec_len = 1;
-	} else if (after48(ackno, av->av_buf_ackno)) {
-		const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
-
-		/*
-		 * Look if the state of this packet is the same as the
-		 * previous ackno and if so if we can bump the head len.
-		 */
-		if (delta == 1 && dccp_ackvec_state(cur_head) == state &&
-		    dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN)
-			*cur_head += 1;
-		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
-			return -ENOBUFS;
-	} else {
-		/*
-		 * A.1.2.  Old Packets
-		 *
-		 *	When a packet with Sequence Number S <= buf_ackno
-		 *	arrives, the HC-Receiver will scan the table for
-		 *	the byte corresponding to S. (Indexing structures
-		 *	could reduce the complexity of this scan.)
-		 */
-		u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
-
-		while (1) {
-			const u8 len = dccp_ackvec_runlen(cur_head);
-			/*
-			 * valid packets not yet in av_buf have a reserved
-			 * entry, with a len equal to 0.
-			 */
-			if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) {
-				dccp_pr_debug("Found %llu reserved seat!\n",
-					      (unsigned long long)ackno);
-				*cur_head = state;
-				goto out;
-			}
-			/* len == 0 means one packet */
-			if (delta < len + 1)
-				goto out_duplicate;
-
-			delta -= len + 1;
-			if (++cur_head == buf_end)
-				cur_head = av->av_buf;
-		}
-	}
-
-	av->av_buf_ackno = ackno;
-out:
-	return 0;
-
-out_duplicate:
-	/* Duplicate packet */
-	dccp_pr_debug("Received a dup or already considered lost "
-		      "packet: %llu\n", (unsigned long long)ackno);
-	return -EILSEQ;
-}
-
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
-				     struct dccp_ackvec_record *avr)
-{
-	struct dccp_ackvec_record *next;
-
-	/* sort out vector length */
-	if (av->av_buf_head <= avr->avr_ack_ptr)
-		av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
-	else
-		av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 -
-				 av->av_buf_head + avr->avr_ack_ptr;
-
-	/* free records */
-	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del(&avr->avr_node);
-		kmem_cache_free(dccp_ackvec_record_slab, avr);
-	}
-}
-
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
-				 const u64 ackno)
-{
-	struct dccp_ackvec_record *avr;
-
-	/*
-	 * If we traverse backwards, it should be faster when we have large
-	 * windows. We will be receiving ACKs for stuff we sent a while back
-	 * -sorbo.
-	 */
-	list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
-		if (ackno == avr->avr_ack_seqno) {
-			dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
-				      "ack_ackno=%llu, ACKED!\n",
-				      dccp_role(sk), avr->avr_ack_runlen,
-				      (unsigned long long)avr->avr_ack_seqno,
-				      (unsigned long long)avr->avr_ack_ackno);
-			dccp_ackvec_throw_record(av, avr);
-			break;
-		} else if (avr->avr_ack_seqno > ackno)
-			break; /* old news */
-	}
-}
-
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
-					    struct sock *sk, u64 *ackno,
-					    const unsigned char len,
-					    const unsigned char *vector)
-{
-	unsigned char i;
-	struct dccp_ackvec_record *avr;
-
-	/* Check if we actually sent an ACK vector */
-	if (list_empty(&av->av_records))
-		return;
-
-	i = len;
-	/*
-	 * XXX
-	 * I think it might be more efficient to work backwards. See comment on
-	 * rcv_ackno. -sorbo.
-	 */
-	avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
-	while (i--) {
-		const u8 rl = dccp_ackvec_runlen(vector);
-		u64 ackno_end_rl;
-
-		dccp_set_seqno(&ackno_end_rl, *ackno - rl);
-
-		/*
-		 * If our AVR sequence number is greater than the ack, go
-		 * forward in the AVR list until it is not so.
-		 */
-		list_for_each_entry_from(avr, &av->av_records, avr_node) {
-			if (!after48(avr->avr_ack_seqno, *ackno))
-				goto found;
-		}
-		/* End of the av_records list, not found, exit */
-		break;
-found:
-		if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
-			if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) {
-				dccp_pr_debug("%s ACK vector 0, len=%d, "
-					      "ack_seqno=%llu, ack_ackno=%llu, "
-					      "ACKED!\n",
-					      dccp_role(sk), len,
-					      (unsigned long long)
-					      avr->avr_ack_seqno,
-					      (unsigned long long)
-					      avr->avr_ack_ackno);
-				dccp_ackvec_throw_record(av, avr);
-				break;
-			}
-			/*
-			 * If it wasn't received, continue scanning... we might
-			 * find another one.
-			 */
-		}
-
-		dccp_set_seqno(ackno, ackno_end_rl - 1);
-		++vector;
-	}
-}
-
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
-{
-	if (len > DCCP_SINGLE_OPT_MAXLEN)
-		return -1;
-
-	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
-	dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
-					ackno, len, value);
-	return 0;
-}
-
 /**
  * dccp_ackvec_clear_state  -  Perform house-keeping / garbage-collection
  * This routine is called when the peer acknowledges the receipt of Ack Vectors
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 36ca2e9e5c8..db447503b63 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -64,7 +64,6 @@ static inline u8 dccp_ackvec_state(const u8 *cell)
  *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
  * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
  * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
- * @av_veclen:	   length of the live portion of @av_buf
  */
 struct dccp_ackvec {
 	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
@@ -75,7 +74,6 @@ struct dccp_ackvec {
 	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
 	u8			av_overflow:1;
 	struct list_head	av_records;
-	u16			av_vec_len;
 };
 
 /** struct dccp_ackvec_record - Records information about sent Ack Vectors
@@ -101,25 +99,12 @@ struct dccp_ackvec_record {
 	u8		 avr_ack_nonce:1;
 };
 
-struct sock;
-struct sk_buff;
-
-#ifdef CONFIG_IP_DCCP_ACKVEC
-extern int dccp_ackvec_init(void);
+extern int  dccp_ackvec_init(void);
 extern void dccp_ackvec_exit(void);
 
 extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
 extern void dccp_ackvec_free(struct dccp_ackvec *av);
 
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-			   const u64 ackno, const u8 state);
-
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
-					struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-			     u64 *ackno, const u8 opt,
-			     const u8 *value, const u8 len);
-
 extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
 extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
 extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
@@ -129,66 +114,4 @@ static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
 {
 	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
 }
-#else /* CONFIG_IP_DCCP_ACKVEC */
-static inline int dccp_ackvec_init(void)
-{
-	return 0;
-}
-
-static inline void dccp_ackvec_exit(void)
-{
-}
-
-static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
-	return NULL;
-}
-
-static inline void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-}
-
-static inline void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
-{
-
-}
-
-static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-				  const u64 ackno, const u8 state)
-{
-	return -1;
-}
-
-static inline void dccp_ackvec_clear_state(struct dccp_ackvec *av,
-					    const u64 ackno)
-{
-}
-
-static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
-					       struct sock *sk, const u64 ackno)
-{
-}
-
-static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-				    const u64 *ackno, const u8 opt,
-				    const u8 *value, const u8 len)
-{
-	return -1;
-}
-
-static inline int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 nonce)
-{
-	return -1;
-}
-
-static inline u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
-{
-	return 0;
-}
-
-static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
-{
-	return true;
-}
-#endif /* CONFIG_IP_DCCP_ACKVEC */
 #endif /* _ACKVEC_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 12275943eab..44c7e90248f 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -4,7 +4,6 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
 config IP_DCCP_CCID2
 	tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
 	def_tristate IP_DCCP
-	select IP_DCCP_ACKVEC
 	---help---
 	  CCID 2, TCP-like Congestion Control, denotes Additive Increase,
 	  Multiplicative Decrease (AIMD) congestion control with behavior
-- 
cgit v1.2.3


From c8bf462bc567c3dcb083ff95cc13060dd06f138c Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Separate option parsing from CCID processing

This patch replaces an almost identical replication of code: large parts
of dccp_parse_options() re-appeared as ccid2_ackvector() in ccid2.c.

Apart from the duplication, this caused two more problems:
 1. CCIDs should not need to be concerned with parsing header options;
 2. one can not assume that Ack Vectors appear as a contiguous area within an
    skb, it is legal to insert other options and/or padding in between. The
    current code would throw an error and stop reading in such a case.

The patch provides a new data structure and associated list housekeeping.

Only small changes were necessary to integrate with CCID-2: data structure
initialisation, adapt list traversal routine, and add call to the provided
cleanup routine.

The latter also lead to fixing the following BUG: CCID-2 so far ignored
Ack Vectors on all packets other than Ack/DataAck, which is incorrect,
since Ack Vectors can be present on any packet that has an Ack field.

Details:
--------
 * received Ack Vectors are parsed by dccp_parse_options() alone, which passes
   the result on to the CCID-specific routine ccid_hc_tx_parse_options();
 * CCIDs interested in using/decoding Ack Vector information will add code
   to fetch parsed Ack Vectors via this interface;
 * a data structure, `struct dccp_ackvec_parsed' is provided as interface;
 * this structure arranges Ack Vectors of the same skb into a FIFO order;
 * a doubly-linked list is used to keep the required FIFO code small.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c      |  28 ++++++++++
 net/dccp/ackvec.h      |  19 +++++++
 net/dccp/ccids/ccid2.c | 135 +++++++++++++++----------------------------------
 net/dccp/ccids/ccid2.h |   2 +
 net/dccp/options.c     |  17 ++++---
 5 files changed, 101 insertions(+), 100 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 66b8a51300c..41819848bdd 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -343,6 +343,34 @@ free_records:
 	}
 }
 
+/*
+ *	Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
+{
+	struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+	if (new == NULL)
+		return -ENOBUFS;
+	new->vec   = vec;
+	new->len   = len;
+	new->nonce = nonce;
+
+	list_add_tail(&new->node, head);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+	struct dccp_ackvec_parsed *cur, *next;
+
+	list_for_each_entry_safe(cur, next, parsed_chunks, node)
+		kfree(cur);
+	INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
+
 int __init dccp_ackvec_init(void)
 {
 	dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index db447503b63..6cdca79a99f 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -114,4 +114,23 @@ static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
 {
 	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
 }
+
+/**
+ * struct dccp_ackvec_parsed  -  Record offsets of Ack Vectors in skb
+ * @vec:	start of vector (offset into skb)
+ * @len:	length of @vec
+ * @nonce:	whether @vec had an ECN nonce of 0 or 1
+ * @node:	FIFO - arranged in descending order of ack_ackno
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+	u8		 *vec,
+			 len,
+			 nonce:1;
+	struct list_head node;
+};
+
+extern int dccp_ackvec_parsed_add(struct list_head *head,
+				  u8 *vec, u8 len, u8 nonce);
+extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
 #endif /* _ACKVEC_H */
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 813d5cd40e8..bbf16b35734 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -317,68 +317,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 #endif
 }
 
-/* XXX Lame code duplication!
- * returns -1 if none was found.
- * else returns the next offset to use in the function call.
- */
-static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
-			   unsigned char **vec, unsigned char *veclen)
-{
-	const struct dccp_hdr *dh = dccp_hdr(skb);
-	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
-	unsigned char *opt_ptr;
-	const unsigned char *opt_end = (unsigned char *)dh +
-					(dh->dccph_doff * 4);
-	unsigned char opt, len;
-	unsigned char *value;
-
-	BUG_ON(offset < 0);
-	options += offset;
-	opt_ptr = options;
-	if (opt_ptr >= opt_end)
-		return -1;
-
-	while (opt_ptr != opt_end) {
-		opt   = *opt_ptr++;
-		len   = 0;
-		value = NULL;
-
-		/* Check if this isn't a single byte option */
-		if (opt > DCCPO_MAX_RESERVED) {
-			if (opt_ptr == opt_end)
-				goto out_invalid_option;
-
-			len = *opt_ptr++;
-			if (len < 3)
-				goto out_invalid_option;
-			/*
-			 * Remove the type and len fields, leaving
-			 * just the value size
-			 */
-			len     -= 2;
-			value   = opt_ptr;
-			opt_ptr += len;
-
-			if (opt_ptr > opt_end)
-				goto out_invalid_option;
-		}
-
-		switch (opt) {
-		case DCCPO_ACK_VECTOR_0:
-		case DCCPO_ACK_VECTOR_1:
-			*vec	= value;
-			*veclen = len;
-			return offset + (opt_ptr - options);
-		}
-	}
-
-	return -1;
-
-out_invalid_option:
-	DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
-	return -1;
-}
-
 static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
@@ -499,15 +437,27 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
 		ccid2_change_l_ack_ratio(sk, hctx->cwnd);
 }
 
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	switch (option) {
+	case DCCPO_ACK_VECTOR_0:
+	case DCCPO_ACK_VECTOR_1:
+		return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen,
+					      option - DCCPO_ACK_VECTOR_0);
+	}
+	return 0;
+}
+
 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	struct dccp_ackvec_parsed *avp;
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
-	unsigned char *vector;
-	unsigned char veclen;
-	int offset = 0;
 	int done = 0;
 	unsigned int maxincr = 0;
 
@@ -542,17 +492,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* check forward path congestion */
-	/* still didn't send out new data packets */
-	if (hctx->seqh == hctx->seqt)
+	if (dccp_packet_without_ack(skb))
 		return;
 
-	switch (DCCP_SKB_CB(skb)->dccpd_type) {
-	case DCCP_PKT_ACK:
-	case DCCP_PKT_DATAACK:
-		break;
-	default:
-		return;
-	}
+	/* still didn't send out new data packets */
+	if (hctx->seqh == hctx->seqt)
+		goto done;
 
 	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
 	if (after48(ackno, hctx->high_ack))
@@ -576,15 +521,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
 
 	/* go through all ack vectors */
-	while ((offset = ccid2_ackvector(sk, skb, offset,
-					 &vector, &veclen)) != -1) {
+	list_for_each_entry(avp, &hctx->av_chunks, node) {
 		/* go through this ack vector */
-		while (veclen--) {
-			u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector));
+		for (; avp->len--; avp->vec++) {
+			u64 ackno_end_rl = SUB48(ackno,
+						 dccp_ackvec_runlen(avp->vec));
 
-			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
+			ccid2_pr_debug("ackvec %llu |%u,%u|\n",
 				       (unsigned long long)ackno,
-				       (unsigned long long)ackno_end_rl);
+				       dccp_ackvec_state(avp->vec) >> 6,
+				       dccp_ackvec_runlen(avp->vec));
 			/* if the seqno we are analyzing is larger than the
 			 * current ackno, then move towards the tail of our
 			 * seqnos.
@@ -603,7 +549,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = dccp_ackvec_state(vector);
+				const u8 state = dccp_ackvec_state(avp->vec);
 
 				/* new packet received or marked */
 				if (state != DCCPAV_NOT_RECEIVED &&
@@ -630,7 +576,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				break;
 
 			ackno = SUB48(ackno_end_rl, 1);
-			vector++;
 		}
 		if (done)
 			break;
@@ -694,6 +639,8 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	ccid2_hc_tx_check_sanity(hctx);
+done:
+	dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
 }
 
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -727,6 +674,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	hctx->rpdupack  = -1;
 	hctx->last_cong = jiffies;
 	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
+	INIT_LIST_HEAD(&hctx->av_chunks);
 
 	ccid2_hc_tx_check_sanity(hctx);
 	return 0;
@@ -762,17 +710,18 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 }
 
 static struct ccid_operations ccid2 = {
-	.ccid_id		= DCCPC_CCID2,
-	.ccid_name		= "TCP-like",
-	.ccid_owner		= THIS_MODULE,
-	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
-	.ccid_hc_tx_init	= ccid2_hc_tx_init,
-	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
-	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
-	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
-	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
-	.ccid_hc_rx_obj_size	= sizeof(struct ccid2_hc_rx_sock),
-	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
+	.ccid_id		  = DCCPC_CCID2,
+	.ccid_name		  = "TCP-like",
+	.ccid_owner		  = THIS_MODULE,
+	.ccid_hc_tx_obj_size	  = sizeof(struct ccid2_hc_tx_sock),
+	.ccid_hc_tx_init	  = ccid2_hc_tx_init,
+	.ccid_hc_tx_exit	  = ccid2_hc_tx_exit,
+	.ccid_hc_tx_send_packet	  = ccid2_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	  = ccid2_hc_tx_packet_sent,
+	.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+	.ccid_hc_tx_packet_recv	  = ccid2_hc_tx_packet_recv,
+	.ccid_hc_rx_obj_size	  = sizeof(struct ccid2_hc_rx_sock),
+	.ccid_hc_rx_packet_recv	  = ccid2_hc_rx_packet_recv,
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index d7815804bce..907deed255b 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -47,6 +47,7 @@ struct ccid2_seq {
  * @lastrtt: time RTT was last measured
  * @rpseq: last consecutive seqno
  * @rpdupack: dupacks since rpseq
+ * @av_chunks: list of Ack Vectors received on current skb
  */
 struct ccid2_hc_tx_sock {
 	u32			cwnd;
@@ -66,6 +67,7 @@ struct ccid2_hc_tx_sock {
 	int			rpdupack;
 	unsigned long		last_cong;
 	u64			high_ack;
+	struct list_head	av_chunks;
 };
 
 struct ccid2_hc_rx_sock {
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 791e07853a7..e5a32979d7d 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -128,13 +128,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			if (rc)
 				goto out_featneg_failed;
 			break;
-		case DCCPO_ACK_VECTOR_0:
-		case DCCPO_ACK_VECTOR_1:
-			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
-				break;
-			dccp_pr_debug("%s Ack Vector (len=%u)\n", dccp_role(sk),
-				      len);
-			break;
 		case DCCPO_TIMESTAMP:
 			if (len != 4)
 				goto out_invalid_option;
@@ -224,6 +217,16 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 						     pkt_type, opt, value, len))
 				goto out_invalid_option;
 			break;
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
+				break;
+			/*
+			 * Ack vectors are processed by the TX CCID if it is
+			 * interested. The RX CCID need not parse Ack Vectors,
+			 * since it is only interested in clearing old state.
+			 * Fall through.
+			 */
 		case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
 			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
 						     pkt_type, opt, value, len))
-- 
cgit v1.2.3


From f4a66ca4d2ff093c0f9111b449a248ffb8209b4d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Return-value convention of hc_tx_send_packet()

This patch reorganises the return value convention of the CCID TX sending
function, to permit more flexible schemes, as required by subsequent patches.

Currently the convention is
 * values < 0     mean error,
 * a value == 0   means "send now", and
 * a value x > 0  means "send in x milliseconds".

The patch provides symbolic constants and a function to interpret return values.
In addition, it caps the maximum positive return value to 0xFFFF milliseconds,
corresponding to 65.535 seconds.

This is possible since in CCID-3 the maximum inter-packet gap is t_mbi = 64 sec.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccid.h        | 34 +++++++++++++++++++++++++++++++---
 net/dccp/ccids/ccid3.c | 12 ++++++------
 2 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 430a3c24f97..d27054ba215 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -129,13 +129,41 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
 
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing    (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
+	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs  */
+	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode */
+	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode */
+	CCID_PACKET_ERR =		 0xF0000,  /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+	if (return_code < 0)
+		return CCID_PACKET_ERR;
+	if (return_code == 0)
+		return CCID_PACKET_SEND_AT_ONCE;
+	if (return_code <= CCID_PACKET_DELAY_MAX)
+		return CCID_PACKET_DELAY;
+	return return_code;
+}
+
 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
 					 struct sk_buff *skb)
 {
-	int rc = 0;
 	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
-	return rc;
+		return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 0751a8fa936..0d406f82e43 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -269,11 +269,11 @@ out:
 	sock_put(sk);
 }
 
-/*
- * returns
- *   > 0: delay (in msecs) that should pass before actually sending
- *   = 0: can send immediately
- *   < 0: error condition; do not send packet
+/**
+ * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
  */
 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
@@ -349,7 +349,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 
 	/* set the nominal send time for the next following packet */
 	hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi);
-	return 0;
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
-- 
cgit v1.2.3


From e7937772d7a2b0127cc4cbc67bc594e139fdaf63 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Extend CCID packet dequeueing interface

This extends the packet dequeuing interface of dccp_write_xmit() to allow
 1. CCIDs to take care of timing when the next packet may be sent;
 2. delayed sending (as before, with an inter-packet gap up to 65.535 seconds).

The main purpose is to take CCID2 out of its polling mode (when it is network-
limited, it tries every millisecond to send, without interruption).
The interface can also be used to support other CCIDs.

The mode of operation for (2) is as follows:
 * new packet is enqueued via dccp_sendmsg() => dccp_write_xmit(),
 * ccid_hc_tx_send_packet() detects that it may not send (e.g. window full),
 * it signals this condition via `CCID_PACKET_WILL_DEQUEUE_LATER',
 * dccp_write_xmit() returns without further action;
 * after some time the wait-condition for CCID becomes true,
 * that CCID schedules the tasklet,
 * tasklet function calls ccid_hc_tx_send_packet() via dccp_write_xmit(),
 * since the wait-condition is now true, ccid_hc_tx_packet() returns "send now",
 * packet is sent, and possibly more (since dccp_write_xmit() loops).

Code reuse: the taskled function calls dccp_write_xmit(), the timer function
            reduces to a wrapper around the same code.

If the tasklet finds that the socket is locked, it re-schedules the tasklet
function (not the tasklet) after one jiffy.

Changed DCCP_BUG to dccp_pr_debug when transmit_skb returns an error (e.g. when a
local qdisc is used, NET_XMIT_DROP=1 can be returned for many packets).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/output.c | 129 ++++++++++++++++++++++++++++++++++--------------------
 net/dccp/timer.c  |  25 ++++++-----
 2 files changed, 95 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/net/dccp/output.c b/net/dccp/output.c
index bfda071559f..9afd58e39e2 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -251,65 +251,98 @@ do_interrupted:
 	goto out;
 }
 
+/**
+ * dccp_xmit_packet  -  Send data packet under control of CCID
+ * Transmits next-queued payload and informs CCID to account for the packet.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+	int err, len;
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue);
+
+	if (unlikely(skb == NULL))
+		return;
+	len = skb->len;
+
+	if (sk->sk_state == DCCP_PARTOPEN) {
+		const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+		/*
+		 * See 8.1.5 - Handshake Completion.
+		 *
+		 * For robustness we resend Confirm options until the client has
+		 * entered OPEN. During the initial feature negotiation, the MPS
+		 * is smaller than usual, reduced by the Change/Confirm options.
+		 */
+		if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+			DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+			dccp_send_ack(sk);
+			dccp_feat_list_purge(&dp->dccps_featneg);
+		}
+
+		inet_csk_schedule_ack(sk);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					      inet_csk(sk)->icsk_rto,
+					      DCCP_RTO_MAX);
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+	} else if (dccp_ack_pending(sk)) {
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+	} else {
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
+	}
+
+	err = dccp_transmit_skb(sk, skb);
+	if (err)
+		dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+	/*
+	 * Register this one as sent even if an error occurred. To the remote
+	 * end a local packet drop is indistinguishable from network loss, i.e.
+	 * any local drop will eventually be reported via receiver feedback.
+	 */
+	ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+	/*
+	 * If the CCID needs to transfer additional header options out-of-band
+	 * (e.g. Ack Vectors or feature-negotiation options), it activates this
+	 * flag to schedule a Sync. The Sync will automatically incorporate all
+	 * currently pending header options, thus clearing the backlog.
+	 */
+	if (dp->dccps_sync_scheduled)
+		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+}
+
 void dccp_write_xmit(struct sock *sk, int block)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
 
 	while ((skb = skb_peek(&sk->sk_write_queue))) {
-		int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+		int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
 
-		if (err > 0) {
+		switch (ccid_packet_dequeue_eval(rc)) {
+		case CCID_PACKET_WILL_DEQUEUE_LATER:
+			return;
+		case CCID_PACKET_DELAY:
 			if (!block) {
 				sk_reset_timer(sk, &dp->dccps_xmit_timer,
-						msecs_to_jiffies(err)+jiffies);
+						msecs_to_jiffies(rc)+jiffies);
+				return;
+			}
+			rc = dccp_wait_for_ccid(sk, skb, rc);
+			if (rc && rc != -EINTR) {
+				DCCP_BUG("err=%d after dccp_wait_for_ccid", rc);
+				skb_dequeue(&sk->sk_write_queue);
+				kfree_skb(skb);
 				break;
-			} else
-				err = dccp_wait_for_ccid(sk, skb, err);
-			if (err && err != -EINTR)
-				DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
-		}
-
-		skb_dequeue(&sk->sk_write_queue);
-		if (err == 0) {
-			struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-			const int len = skb->len;
-
-			if (sk->sk_state == DCCP_PARTOPEN) {
-				const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
-				/*
-				 * See 8.1.5 - Handshake Completion.
-				 *
-				 * For robustness we resend Confirm options until the client has
-				 * entered OPEN. During the initial feature negotiation, the MPS
-				 * is smaller than usual, reduced by the Change/Confirm options.
-				 */
-				if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
-					DCCP_WARN("Payload too large (%d) for featneg.\n", len);
-					dccp_send_ack(sk);
-					dccp_feat_list_purge(&dp->dccps_featneg);
-				}
-
-				inet_csk_schedule_ack(sk);
-				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  inet_csk(sk)->icsk_rto,
-						  DCCP_RTO_MAX);
-				dcb->dccpd_type = DCCP_PKT_DATAACK;
-			} else if (dccp_ack_pending(sk))
-				dcb->dccpd_type = DCCP_PKT_DATAACK;
-			else
-				dcb->dccpd_type = DCCP_PKT_DATA;
-
-			err = dccp_transmit_skb(sk, skb);
-			ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
-			if (err)
-				DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
-					 err);
-			if (dp->dccps_sync_scheduled)
-				dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
-		} else {
-			dccp_pr_debug("packet discarded due to err=%d\n", err);
+			}
+			/* fall through */
+		case CCID_PACKET_SEND_AT_ONCE:
+			dccp_xmit_packet(sk);
+			break;
+		case CCID_PACKET_ERR:
+			skb_dequeue(&sk->sk_write_queue);
 			kfree_skb(skb);
+			dccp_pr_debug("packet discarded due to err=%d\n", rc);
 		}
 	}
 }
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 162d1e683c3..9369aca4b0e 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -237,32 +237,35 @@ out:
 	sock_put(sk);
 }
 
-/* Transmit-delay timer: used by the CCIDs to delay actual send time */
-static void dccp_write_xmit_timer(unsigned long data)
+/**
+ * dccp_write_xmitlet  -  Workhorse for CCID packet dequeueing interface
+ * See the comments above %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
-	struct dccp_sock *dp = dccp_sk(sk);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk))
-		sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
+		sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
 	else
 		dccp_write_xmit(sk, 0);
 	bh_unlock_sock(sk);
-	sock_put(sk);
 }
 
-static void dccp_init_write_xmit_timer(struct sock *sk)
+static void dccp_write_xmit_timer(unsigned long data)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-
-	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
-			(unsigned long)sk);
+	dccp_write_xmitlet(data);
+	sock_put((struct sock *)data);
 }
 
 void dccp_init_xmit_timers(struct sock *sk)
 {
-	dccp_init_write_xmit_timer(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+							     (unsigned long)sk);
 	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
 				  &dccp_keepalive_timer);
 }
-- 
cgit v1.2.3


From 146993cf5174472644ed11bd5fb539f0af8bfa49 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Refine the wait-for-ccid mechanism

This extends the existing wait-for-ccid routine so that it may be used with
different types of CCID. It further addresses the problems listed below.

The code looks if the write queue is non-empty and grants the TX CCID up to
`timeout' jiffies to drain the queue. It will instead purge that queue if
 * the delay suggested by the CCID exceeds the time budget;
 * a socket error occurred while waiting for the CCID;
 * there is a signal pending (eg. annoyed user pressed Control-C);
 * the CCID does not support delays (we don't know how long it will take).


                 D e t a i l s  [can be removed]
                 -------------------------------
DCCP's sending mechanism functions a bit like non-blocking I/O: dccp_sendmsg()
will enqueue up to net.dccp.default.tx_qlen packets (default=5), without waiting
for them to be released to the network.

Rate-based CCIDs, such as CCID3/4, can impose sending delays of up to maximally
64 seconds (t_mbi in RFC 3448). Hence the write queue may still contain packets
when the application closes. Since the write queue is congestion-controlled by
the CCID, draining the queue is also under control of the CCID.

There are several problems that needed to be addressed:
 1) The queue-drain mechanism only works with rate-based CCIDs. If CCID2 for
    example has a full TX queue and becomes network-limited just as the
    application wants to close, then waiting for CCID2 to become unblocked could
    lead to an indefinite  delay (i.e., application "hangs").
 2) Since each TX CCID in turn uses a feedback mechanism, there may be changes
    in its sending policy while the queue is being drained. This can lead to
    further delays during which the application will not be able to terminate.
 3) The minimum wait time for CCID3/4 can be expected to be the queue length
    times the current inter-packet delay. For example if tx_qlen=100 and a delay
    of 15 ms is used for each packet, then the application would have to wait
    for a minimum of 1.5 seconds before being allowed to exit.
 4) There is no way for the user/application to control this behaviour. It would
    be good to use the timeout argument of dccp_close() as an upper bound. Then
    the maximum time that an application is willing to wait for its CCIDs to can
    be set via the SO_LINGER option.

These problems are addressed by giving the CCID a grace period of up to the
`timeout' value.

The wait-for-ccid function is, as before, used when the application
 (a) has read all the data in its receive buffer and
 (b) if SO_LINGER was set with a non-zero linger time, or
 (c) the socket is either in the OPEN (active close) or in the PASSIVE_CLOSEREQ
     state (client application closes after receiving CloseReq).

In addition, there is a catch-all case by calling __skb_queue_purge() after
waiting for the CCID. This is necessary since the write queue may still have
data when
 (a) the host has been passively-closed,
 (b) abnormal termination (unread data, zero linger time),
 (c) wait-for-ccid could not finish within the given time limit.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/dccp.h   |   3 +-
 net/dccp/output.c | 115 ++++++++++++++++++++++++++++++------------------------
 net/dccp/proto.c  |  15 ++++++-
 net/dccp/timer.c  |   2 +-
 4 files changed, 82 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 1e65378eea3..74c90cd2767 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -234,8 +234,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
 			   const enum dccp_pkt_type pkt_type);
 
-extern void dccp_write_xmit(struct sock *sk, int block);
+extern void dccp_write_xmit(struct sock *sk);
 extern void dccp_write_space(struct sock *sk);
+extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
 
 extern void dccp_init_xmit_timers(struct sock *sk);
 static inline void dccp_clear_xmit_timers(struct sock *sk)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 9afd58e39e2..39056dc6135 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -206,49 +206,29 @@ void dccp_write_space(struct sock *sk)
 }
 
 /**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * dccp_wait_for_ccid  -  Await CCID send permission
  * @sk:    socket to wait for
- * @skb:   current skb to pass on for waiting
- * @delay: sleep timeout in milliseconds (> 0)
- * This function is called by default when the socket is closed, and
- * when a non-zero linger time is set on the socket. For consistency
+ * @delay: timeout in jiffies
+ * This is used by CCIDs which need to delay the send time in process context.
  */
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
 	DEFINE_WAIT(wait);
-	unsigned long jiffdelay;
-	int rc;
-
-	do {
-		dccp_pr_debug("delayed send by %d msec\n", delay);
-		jiffdelay = msecs_to_jiffies(delay);
-
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	long remaining;
 
-		sk->sk_write_pending++;
-		release_sock(sk);
-		schedule_timeout(jiffdelay);
-		lock_sock(sk);
-		sk->sk_write_pending--;
+	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	sk->sk_write_pending++;
+	release_sock(sk);
 
-		if (sk->sk_err)
-			goto do_error;
-		if (signal_pending(current))
-			goto do_interrupted;
+	remaining = schedule_timeout(delay);
 
-		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-	} while ((delay = rc) > 0);
-out:
+	lock_sock(sk);
+	sk->sk_write_pending--;
 	finish_wait(sk->sk_sleep, &wait);
-	return rc;
-
-do_error:
-	rc = -EPIPE;
-	goto out;
-do_interrupted:
-	rc = -EINTR;
-	goto out;
+
+	if (signal_pending(current) || sk->sk_err)
+		return -1;
+	return remaining;
 }
 
 /**
@@ -311,7 +291,53 @@ static void dccp_xmit_packet(struct sock *sk)
 		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
 }
 
-void dccp_write_xmit(struct sock *sk, int block)
+/**
+ * dccp_flush_write_queue  -  Drain queue at end of connection
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	long delay, rc;
+
+	while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+		switch (ccid_packet_dequeue_eval(rc)) {
+		case CCID_PACKET_WILL_DEQUEUE_LATER:
+			/*
+			 * If the CCID determines when to send, the next sending
+			 * time is unknown or the CCID may not even send again
+			 * (e.g. remote host crashes or lost Ack packets).
+			 */
+			DCCP_WARN("CCID did not manage to send all packets\n");
+			return;
+		case CCID_PACKET_DELAY:
+			delay = msecs_to_jiffies(rc);
+			if (delay > *time_budget)
+				return;
+			rc = dccp_wait_for_ccid(sk, delay);
+			if (rc < 0)
+				return;
+			*time_budget -= (delay - rc);
+			/* check again if we can send now */
+			break;
+		case CCID_PACKET_SEND_AT_ONCE:
+			dccp_xmit_packet(sk);
+			break;
+		case CCID_PACKET_ERR:
+			skb_dequeue(&sk->sk_write_queue);
+			kfree_skb(skb);
+			dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+		}
+	}
+}
+
+void dccp_write_xmit(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
@@ -323,19 +349,9 @@ void dccp_write_xmit(struct sock *sk, int block)
 		case CCID_PACKET_WILL_DEQUEUE_LATER:
 			return;
 		case CCID_PACKET_DELAY:
-			if (!block) {
-				sk_reset_timer(sk, &dp->dccps_xmit_timer,
-						msecs_to_jiffies(rc)+jiffies);
-				return;
-			}
-			rc = dccp_wait_for_ccid(sk, skb, rc);
-			if (rc && rc != -EINTR) {
-				DCCP_BUG("err=%d after dccp_wait_for_ccid", rc);
-				skb_dequeue(&sk->sk_write_queue);
-				kfree_skb(skb);
-				break;
-			}
-			/* fall through */
+			sk_reset_timer(sk, &dp->dccps_xmit_timer,
+				       jiffies + msecs_to_jiffies(rc));
+			return;
 		case CCID_PACKET_SEND_AT_ONCE:
 			dccp_xmit_packet(sk);
 			break;
@@ -660,7 +676,6 @@ void dccp_send_close(struct sock *sk, const int active)
 		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
 
 	if (active) {
-		dccp_write_xmit(sk, 1);
 		dccp_skb_entail(sk, skb);
 		dccp_transmit_skb(sk, skb_clone(skb, prio));
 		/*
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 11905e0cf8f..8c125ffab1c 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -735,7 +735,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		goto out_discard;
 
 	skb_queue_tail(&sk->sk_write_queue, skb);
-	dccp_write_xmit(sk,0);
+	dccp_write_xmit(sk);
 out_release:
 	release_sock(sk);
 	return rc ? : len;
@@ -958,9 +958,22 @@ void dccp_close(struct sock *sk, long timeout)
 		/* Check zero linger _after_ checking for unread data. */
 		sk->sk_prot->disconnect(sk, 0);
 	} else if (sk->sk_state != DCCP_CLOSED) {
+		/*
+		 * Normal connection termination. May need to wait if there are
+		 * still packets in the TX queue that are delayed by the CCID.
+		 */
+		dccp_flush_write_queue(sk, &timeout);
 		dccp_terminate_connection(sk);
 	}
 
+	/*
+	 * Flush write queue. This may be necessary in several cases:
+	 * - we have been closed by the peer but still have application data;
+	 * - abortive termination (unread data or zero linger time),
+	 * - normal termination but queue could not be flushed within time limit
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+
 	sk_stream_wait_close(sk, timeout);
 
 adjudge_to_death:
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 9369aca4b0e..e02d5a94f4c 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -249,7 +249,7 @@ static void dccp_write_xmitlet(unsigned long data)
 	if (sock_owned_by_user(sk))
 		sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
 	else
-		dccp_write_xmit(sk, 0);
+		dccp_write_xmit(sk);
 	bh_unlock_sock(sk);
 }
 
-- 
cgit v1.2.3


From 83337dae6ca94d801b6700600244865cd694205b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Stop polling

This updates CCID2 to use the CCID dequeuing mechanism, converting from
previous constant-polling to a now event-driven mechanism.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c | 21 +++++++++++++--------
 net/dccp/ccids/ccid2.h |  5 +++++
 2 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index bbf16b35734..c7d83e3c164 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -123,12 +123,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 
 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	if (hctx->pipe < hctx->cwnd)
-		return 0;
-
-	return 1; /* XXX CCID should dequeue when ready instead of polling */
+	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+		return CCID_PACKET_WILL_DEQUEUE_LATER;
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
@@ -168,6 +165,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
 	long s;
 
 	bh_lock_sock(sk);
@@ -187,8 +185,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	if (s > 60)
 		hctx->rto = 60 * HZ;
 
-	ccid2_start_rto_timer(sk);
-
 	/* adjust pipe, cwnd etc */
 	hctx->ssthresh = hctx->cwnd / 2;
 	if (hctx->ssthresh < 2)
@@ -205,6 +201,11 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	hctx->rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
 	ccid2_hc_tx_check_sanity(hctx);
+
+	/* if we were blocked before, we may now send cwnd=1 packet */
+	if (sender_was_blocked)
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	ccid2_start_rto_timer(sk);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
@@ -455,6 +456,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
 	struct dccp_ackvec_parsed *avp;
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
@@ -640,6 +642,9 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 	ccid2_hc_tx_check_sanity(hctx);
 done:
+	/* check if incoming Acks allow pending packets to be sent */
+	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
 	dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
 }
 
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 907deed255b..d5900dd5d4f 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -70,6 +70,11 @@ struct ccid2_hc_tx_sock {
 	struct list_head	av_chunks;
 };
 
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
+{
+	return (hctx->pipe >= hctx->cwnd);
+}
+
 struct ccid2_hc_rx_sock {
 	int			data;
 };
-- 
cgit v1.2.3


From c6f0f2e71f3088a0f05502d6adb0f667b84028c3 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Remove redundant sanity tests

This removes the ccid2_hc_tx_check_sanity function: it is redundant.

Details:
========
The tx_check_sanity function performs three tests:
 1) it checks that the circular TX list is sorted
    - in ascending order of sequence number (ccid2s_seq)
    - and time (ccid2s_sent),
    - in the direction from `tail' (hctx_seqt) to `head' (hctx_seqh);
 2) it ensures that the entire list has the length seqbufc * CCID2_SEQBUF_LEN;
 3) it ensures that pipe equals the number of packets that were not
    marked `acked' (ccid2s_acked) between `tail' and `head'.

The following argues that each of these tests is redundant, this can be verified
by going through the code.

(1) is not necessary, since both time and GSS increase from one packet to the
next, so that subsequent insertions in tx_packet_sent (which advance the `head'
pointer) will be in ascending order of time and sequence number.

In (2), the length of the list is always equal to seqbufc times CCID2_SEQBUF_LEN
(set to 1024) unless allocation caused an earlier failure, because:
 * at initialisation (tx_init), there is one chunk of size 1024 and seqbufc=1;
 * subsequent calls to tx_alloc_seq take place whenever head->next == tail in
   tx_packet_sent; then a new chunk of size 1024 is inserted between head and
   tail, and seqbufc is incremented by one.

To show that (3) is redundant requires looking at two cases.

The `pipe' variable of the TX socket is incremented only in tx_packet_sent, and
decremented in tx_packet_recv.  When head == tail (TX history empty) then pipe
should be 0, which is the case directly after initialisation and after a
retransmission timeout has occurred (ccid2_hc_tx_rto_expire).

The first case involves parsing Ack Vectors for packets recorded in the live
portion of the buffer, between tail and head. For each packet marked by the
receiver as received (state 0) or ECN-marked (state 1), pipe is decremented by
one, so for all such packets the BUG_ON in tx_check_sanity will not trigger.

The second case is the loss detection in the second half of tx_packet_recv,
below the comment "Check for NUMDUPACK".

The first while-loop here ensures that the sequence number of `seqp' is either
above or equal to `high_ack', or otherwise equal to the highest sequence number
sent so far (of the entry head->prev, as head points to the next unsent entry).
The next while-loop ("while (1)") counts the number of acked packets starting
from that position of seqp, going backwards in the direction from head->prev to
tail. If NUMDUPACK=3 such packets were counted within this loop, `seqp' points
to the last acknowledged packet of these, and the "if (done == NUMDUPACK)" block
is entered next.
The while-loop contained within that block in turn traverses the list backwards,
from head to tail; the position of `seqp' is saved in the variable `last_acked'.
For each packet not marked as `acked', a congestion event is triggered within
the loop, and pipe is decremented. The loop terminates when `seqp' has reached
`tail', whereupon tail is set to the position previously stored in `last_acked'.
Thus, between `last_acked' and the previous position of `tail',
 - pipe has been decremented earlier if the packet was marked as state 0 or 1;
 - pipe was decremented if the packet was not marked as acked.
That is, pipe has been decremented by the number of packets between `last_acked'
and the previous position of `tail'. As a consequence, pipe now again reflects
the number of packets which have not (yet) been acked between the new position
of tail (at `last_acked') and head->prev, or 0 if head==tail. The result is that
the BUG_ON condition in check_sanity will also not be triggered, hence the test
(3) is also redundant.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c | 51 --------------------------------------------------
 1 file changed, 51 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index c7d83e3c164..3b2548bd73f 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -34,51 +34,8 @@
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 static int ccid2_debug;
 #define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
-{
-	int len = 0;
-	int pipe = 0;
-	struct ccid2_seq *seqp = hctx->seqh;
-
-	/* there is data in the chain */
-	if (seqp != hctx->seqt) {
-		seqp = seqp->ccid2s_prev;
-		len++;
-		if (!seqp->ccid2s_acked)
-			pipe++;
-
-		while (seqp != hctx->seqt) {
-			struct ccid2_seq *prev = seqp->ccid2s_prev;
-
-			len++;
-			if (!prev->ccid2s_acked)
-				pipe++;
-
-			/* packets are sent sequentially */
-			BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
-						prev->ccid2s_seq ) >= 0);
-			BUG_ON(time_before(seqp->ccid2s_sent,
-					   prev->ccid2s_sent));
-
-			seqp = prev;
-		}
-	}
-
-	BUG_ON(pipe != hctx->pipe);
-	ccid2_pr_debug("len of chain=%d\n", len);
-
-	do {
-		seqp = seqp->ccid2s_prev;
-		len++;
-	} while (seqp != hctx->seqh);
-
-	ccid2_pr_debug("total len=%d\n", len);
-	BUG_ON(len != hctx->seqbufc * CCID2_SEQBUF_LEN);
-}
 #else
 #define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hctx)
 #endif
 
 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -176,8 +133,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 
 	ccid2_pr_debug("RTO_EXPIRE\n");
 
-	ccid2_hc_tx_check_sanity(hctx);
-
 	/* back-off timer */
 	hctx->rto <<= 1;
 
@@ -200,7 +155,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	hctx->rpseq    = 0;
 	hctx->rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
-	ccid2_hc_tx_check_sanity(hctx);
 
 	/* if we were blocked before, we may now send cwnd=1 packet */
 	if (sender_was_blocked)
@@ -314,7 +268,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 		}
 	} while (0);
 	ccid2_pr_debug("=========\n");
-	ccid2_hc_tx_check_sanity(hctx);
 #endif
 }
 
@@ -463,7 +416,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	int done = 0;
 	unsigned int maxincr = 0;
 
-	ccid2_hc_tx_check_sanity(hctx);
 	/* check reverse path congestion */
 	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
 
@@ -640,7 +592,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		hctx->seqt = hctx->seqt->ccid2s_next;
 	}
 
-	ccid2_hc_tx_check_sanity(hctx);
 done:
 	/* check if incoming Acks allow pending packets to be sent */
 	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
@@ -680,8 +631,6 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	hctx->last_cong = jiffies;
 	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
 	INIT_LIST_HEAD(&hctx->av_chunks);
-
-	ccid2_hc_tx_check_sanity(hctx);
 	return 0;
 }
 
-- 
cgit v1.2.3


From e9803c0104564698d3b8e84ccdb0b8b0e65427e2 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Simplify dec_pipe and rearming of RTO timer

This removes the dec_pipe function and improves the way the RTO timer is rearmed
when a new acknowledgment comes in.

Details and justification for removal:
--------------------------------------
 1) The BUG_ON in dec_pipe is never triggered: pipe is only decremented for TX
    history entries between tail and head, for which it had previously been
    incremented in tx_packet_sent; and it is not decremented twice for the same
    entry, since it is
    - either decremented when a corresponding Ack Vector cell in state 0 or 1
      was received (and then ccid2s_acked==1),
    - or it is decremented when ccid2s_acked==0, as part of the loss detection
      in tx_packet_recv (and hence it can not have been decremented earlier).

 2) Restarting the RTO timer happens for every single entry in each Ack Vector
    parsed by tx_packet_recv (according to RFC 4340, 11.4 this can happen up to
    16192 times per Ack Vector).

 3) The RTO timer should not be restarted when all outstanding data has been
    acknowledged. This is currently done similar to (2), in dec_pipe, when
    pipe has reached 0.

The patch onsolidates the code which rearms the RTO timer, combining the
segments from new_ack and dec_pipe. As a result, the code becomes clearer
(compare with tcp_rearm_rto()).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 3b2548bd73f..fa074d44206 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -353,23 +353,6 @@ static inline void ccid2_new_ack(struct sock *sk,
 			       hctx->srtt, hctx->rttvar,
 			       hctx->rto, HZ, r);
 	}
-
-	/* we got a new ack, so re-start RTO timer */
-	ccid2_hc_tx_kill_rto_timer(sk);
-	ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	if (hctx->pipe == 0)
-		DCCP_BUG("pipe == 0");
-	else
-		hctx->pipe--;
-
-	if (hctx->pipe == 0)
-		ccid2_hc_tx_kill_rto_timer(sk);
 }
 
 static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -518,7 +501,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 					seqp->ccid2s_acked = 1;
 					ccid2_pr_debug("Got ack for %llu\n",
 						       (unsigned long long)seqp->ccid2s_seq);
-					ccid2_hc_tx_dec_pipe(sk);
+					hctx->pipe--;
 				}
 				if (seqp == hctx->seqt) {
 					done = 1;
@@ -574,7 +557,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				 * one ack vector.
 				 */
 				ccid2_congestion_event(sk, seqp);
-				ccid2_hc_tx_dec_pipe(sk);
+				hctx->pipe--;
 			}
 			if (seqp == hctx->seqt)
 				break;
@@ -592,6 +575,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		hctx->seqt = hctx->seqt->ccid2s_next;
 	}
 
+	/* restart RTO timer if not all outstanding data has been acked */
+	if (hctx->pipe == 0)
+		sk_stop_timer(sk, &hctx->rtotimer);
+	else
+		sk_reset_timer(sk, &hctx->rtotimer,
+			       jiffies + hctx->rto);
 done:
 	/* check if incoming Acks allow pending packets to be sent */
 	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
-- 
cgit v1.2.3


From 1435562d7e0412e4885b661843f69859013f9d25 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Replace broken RTT estimator with better algorithm

The current CCID-2 RTT estimator code is in parts broken and lags behind the
suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR.
That code is replaced by the present patch, which reuses the Linux TCP RTT
estimator code - reasons for this code duplication are given below.

Further details:
----------------
 1. The minimum RTO of previously one second has been replaced with TCP's, since
    RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4)
    is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's
    concept of a default RTT (RFC 4340, 3.4).
 2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with
    RFC2988, (2.5).
 3. De-inlined the function ccid2_new_ack().
 4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will
    give the wrong estimate. It should be replaced with one sample per Ack.
    However, at the moment this can not be resolved easily, since
    - it depends on TX history code (which also needs some work),
    - the cleanest solution is not to use the `sent' time at all (saves 4 bytes
      per entry) and use DCCP timestamps / elapsed time to estimated the RTT,
      which however is non-trivial to get right (but needs to be done).

Reasons for reusing the Linux TCP estimator algorithm:
------------------------------------------------------
Some time was spent to find a better alternative, using basic RFC2988 as a first
step. Further analysis and experimentation showed that the Linux TCP RTO
estimator is superior to a basic RFC2988 implementation. A summary is on
http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/

In addition, this estimator fared well in a recent empirical evaluation:

    Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith.
    A Performance Study of Loss Detection/Recovery in Real-world TCP
    Implementations. Proceedings of 15th IEEE International
    Conference on Network Protocols (ICNP-07). 2007.

Thus there is significant benefit in reusing the existing TCP code.


Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c | 171 +++++++++++++++++++++++++++----------------------
 net/dccp/ccids/ccid2.h |  18 ++++--
 2 files changed, 107 insertions(+), 82 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index fa074d44206..22753fd9869 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -110,12 +110,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
-{
-	ccid2_pr_debug("change SRTT to %ld\n", val);
-	hctx->srtt = val;
-}
-
 static void ccid2_start_rto_timer(struct sock *sk);
 
 static void ccid2_hc_tx_rto_expire(unsigned long data)
@@ -123,7 +117,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
-	long s;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
@@ -135,10 +128,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 
 	/* back-off timer */
 	hctx->rto <<= 1;
-
-	s = hctx->rto / HZ;
-	if (s > 60)
-		hctx->rto = 60 * HZ;
+	if (hctx->rto > DCCP_RTO_MAX)
+		hctx->rto = DCCP_RTO_MAX;
 
 	/* adjust pipe, cwnd etc */
 	hctx->ssthresh = hctx->cwnd / 2;
@@ -279,9 +270,87 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
 	ccid2_pr_debug("deleted RTO timer\n");
 }
 
-static inline void ccid2_new_ack(struct sock *sk,
-				 struct ccid2_seq *seqp,
-				 unsigned int *maxincr)
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ *   which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	long m = mrtt ? : 1;
+
+	if (hctx->srtt == 0) {
+		/* First measurement m */
+		hctx->srtt = m << 3;
+		hctx->mdev = m << 1;
+
+		hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
+		hctx->rttvar   = hctx->mdev_max;
+		hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
+	} else {
+		/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+		m -= (hctx->srtt >> 3);
+		hctx->srtt += m;
+
+		/* Similarly, update scaled mdev with regard to |m| */
+		if (m < 0) {
+			m = -m;
+			m -= (hctx->mdev >> 2);
+			/*
+			 * This neutralises RTO increase when RTT < SRTT - mdev
+			 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+			 * in Linux TCP", USENIX 2002, pp. 49-62).
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (hctx->mdev >> 2);
+		}
+		hctx->mdev += m;
+
+		if (hctx->mdev > hctx->mdev_max) {
+			hctx->mdev_max = hctx->mdev;
+			if (hctx->mdev_max > hctx->rttvar)
+				hctx->rttvar = hctx->mdev_max;
+		}
+
+		/*
+		 * Decay RTTVAR at most once per flight, exploiting that
+		 *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
+		 *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
+		 * GAR is a useful bound for FlightSize = pipe, AWL is probably
+		 * too low as it over-estimates pipe.
+		 */
+		if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
+			if (hctx->mdev_max < hctx->rttvar)
+				hctx->rttvar -= (hctx->rttvar -
+						 hctx->mdev_max) >> 2;
+			hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
+			hctx->mdev_max = TCP_RTO_MIN;
+		}
+	}
+
+	/*
+	 * Set RTO from SRTT and RTTVAR
+	 * Clock granularity is ignored since the minimum error for RTTVAR is
+	 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
+	 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
+	 * does not retransmit data, DCCP does not require TCP's recommended
+	 * minimum timeout of one second".
+	 */
+	hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
+
+	if (hctx->rto > DCCP_RTO_MAX)
+		hctx->rto = DCCP_RTO_MAX;
+}
+
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+			  unsigned int *maxincr)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
@@ -295,64 +364,15 @@ static inline void ccid2_new_ack(struct sock *sk,
 			hctx->cwnd += 1;
 			hctx->packets_acked = 0;
 	}
-
-	/* update RTO */
-	if (hctx->srtt == -1 ||
-	    time_after(jiffies, hctx->lastrtt + hctx->srtt)) {
-		unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
-		int s;
-
-		/* first measurement */
-		if (hctx->srtt == -1) {
-			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
-				       r, jiffies,
-				       (unsigned long long)seqp->ccid2s_seq);
-			ccid2_change_srtt(hctx, r);
-			hctx->rttvar = r >> 1;
-		} else {
-			/* RTTVAR */
-			long tmp = hctx->srtt - r;
-			long srtt;
-
-			if (tmp < 0)
-				tmp *= -1;
-
-			tmp >>= 2;
-			hctx->rttvar *= 3;
-			hctx->rttvar >>= 2;
-			hctx->rttvar += tmp;
-
-			/* SRTT */
-			srtt = hctx->srtt;
-			srtt *= 7;
-			srtt >>= 3;
-			tmp = r >> 3;
-			srtt += tmp;
-			ccid2_change_srtt(hctx, srtt);
-		}
-		s = hctx->rttvar << 2;
-		/* clock granularity is 1 when based on jiffies */
-		if (!s)
-			s = 1;
-		hctx->rto = hctx->srtt + s;
-
-		/* must be at least a second */
-		s = hctx->rto / HZ;
-		/* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
-		if (s < 1)
-			hctx->rto = HZ;
-#endif
-		/* max 60 seconds */
-		if (s > 60)
-			hctx->rto = HZ * 60;
-
-		hctx->lastrtt = jiffies;
-
-		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
-			       hctx->srtt, hctx->rttvar,
-			       hctx->rto, HZ, r);
-	}
+	/*
+	 * FIXME: RTT is sampled several times per acknowledgment (for each
+	 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+	 * This causes the RTT to be over-estimated, since the older entries
+	 * in the Ack Vector have earlier sending times.
+	 * The cleanest solution is to not use the ccid2s_sent field at all
+	 * and instead use DCCP timestamps - need to be resolved at some time.
+	 */
+	ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
 }
 
 static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -579,8 +599,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	if (hctx->pipe == 0)
 		sk_stop_timer(sk, &hctx->rtotimer);
 	else
-		sk_reset_timer(sk, &hctx->rtotimer,
-			       jiffies + hctx->rto);
+		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
 done:
 	/* check if incoming Acks allow pending packets to be sent */
 	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
@@ -613,9 +632,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	if (ccid2_hc_tx_alloc_seq(hctx))
 		return -ENOMEM;
 
-	hctx->rto	 = 3 * HZ;
-	ccid2_change_srtt(hctx, -1);
-	hctx->rttvar	= -1;
+	hctx->rto	= DCCP_TIMEOUT_INIT;
 	hctx->rpdupack  = -1;
 	hctx->last_cong = jiffies;
 	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index d5900dd5d4f..8b7a2dee2f6 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -44,7 +44,12 @@ struct ccid2_seq {
  *
  * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
  * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @lastrtt: time RTT was last measured
+ * @srtt: smoothed RTT estimate, scaled by 2^3
+ * @mdev: smoothed RTT variation, scaled by 2^2
+ * @mdev_max: maximum of @mdev during one flight
+ * @rttvar: moving average/maximum of @mdev_max
+ * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @rtt_seq: to decay RTTVAR at most once per flight
  * @rpseq: last consecutive seqno
  * @rpdupack: dupacks since rpseq
  * @av_chunks: list of Ack Vectors received on current skb
@@ -58,10 +63,13 @@ struct ccid2_hc_tx_sock {
 	int			seqbufc;
 	struct ccid2_seq	*seqh;
 	struct ccid2_seq	*seqt;
-	long			rto;
-	long			srtt;
-	long			rttvar;
-	unsigned long		lastrtt;
+	/* RTT measurement: variables/principles are the same as in TCP */
+	u32			srtt,
+				mdev,
+				mdev_max,
+				rttvar,
+				rto;
+	u64			rtt_seq:48;
 	struct timer_list	rtotimer;
 	u64			rpseq;
 	int			rpdupack;
-- 
cgit v1.2.3


From 20bbd0f75ee4b72c1dafc8e5fb6ad39ba506a75c Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-2: Remove wrappers around sk_{reset,stop}_timer()

This removes the wrappers around the sk timer functions as it makes the code
clearer and not much is gained from using wrappers: the BUG_ON in
start_rto_timer will never trigger since that function was called only when
 * the RTO timer expired (rto_expire, and then timer_pending() is false);
 * in tx_packet_sent only if !timer_pending() (BUG_ON is redundant here);
 * previously in new_ack, after stopping the timer (timer_pending() false).

One further motive behind this patch is to replace the RTO timer with the
icsk retransmission timer, as it is already part of the DCCP socket.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 22753fd9869..c539f79ab8e 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -110,8 +110,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
-static void ccid2_start_rto_timer(struct sock *sk);
-
 static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
@@ -150,23 +148,13 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	/* if we were blocked before, we may now send cwnd=1 packet */
 	if (sender_was_blocked)
 		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
-	ccid2_start_rto_timer(sk);
+	/* restart backed-off timer */
+	sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void ccid2_start_rto_timer(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->rto);
-
-	BUG_ON(timer_pending(&hctx->rtotimer));
-	sk_reset_timer(sk, &hctx->rtotimer,
-		       jiffies + hctx->rto);
-}
-
 static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -245,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 
 	/* setup RTO timer */
 	if (!timer_pending(&hctx->rtotimer))
-		ccid2_start_rto_timer(sk);
+		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 	do {
@@ -262,14 +250,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 #endif
 }
 
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	sk_stop_timer(sk, &hctx->rtotimer);
-	ccid2_pr_debug("deleted RTO timer\n");
-}
-
 /**
  * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
  * This code is almost identical with TCP's tcp_rtt_estimator(), since
@@ -645,7 +625,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	int i;
 
-	ccid2_hc_tx_kill_rto_timer(sk);
+	sk_stop_timer(sk, &hctx->rtotimer);
 
 	for (i = 0; i < hctx->seqbufc; i++)
 		kfree(hctx->seqbuf[i]);
-- 
cgit v1.2.3


From b25b0c60b0c39a82bc651aeb6443bcb36cd17f76 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Combine the functionality of enqeueing and cloning

Realising the following call pattern,
 * first dccp_entail() is called to enqueue a new skb and
 * then skb_clone() is called to transmit a clone of that skb,

this patch integrates both interrelated steps into dccp_entail().

Note: the return value of skb_clone is not checked. It may be an idea to add a
      warning if this occurs. In both instances, however, a timer is set for
      retransmission, so that cloning is re-tried via dccp_retransmit_skb().

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/output.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dccp/output.c b/net/dccp/output.c
index 39056dc6135..b1eaf7bcfb1 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -26,11 +26,13 @@ static inline void dccp_event_ack_sent(struct sock *sk)
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
-static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
+static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
 {
 	skb_set_owner_w(skb, sk);
 	WARN_ON(sk->sk_send_head);
 	sk->sk_send_head = skb;
+	return skb_clone(sk->sk_send_head, gfp_any());
 }
 
 /*
@@ -550,8 +552,7 @@ int dccp_connect(struct sock *sk)
 
 	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
 
-	dccp_skb_entail(sk, skb);
-	dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+	dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
 	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the REQUEST until an answer. */
@@ -676,8 +677,7 @@ void dccp_send_close(struct sock *sk, const int active)
 		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
 
 	if (active) {
-		dccp_skb_entail(sk, skb);
-		dccp_transmit_skb(sk, skb_clone(skb, prio));
+		skb = dccp_skb_entail(sk, skb);
 		/*
 		 * Retransmission timer for active-close: RFC 4340, 8.3 requires
 		 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
@@ -690,6 +690,6 @@ void dccp_send_close(struct sock *sk, const int active)
 		 */
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 					  DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
-	} else
-		dccp_transmit_skb(sk, skb);
+	}
+	dccp_transmit_skb(sk, skb);
 }
-- 
cgit v1.2.3


From 6224877b2ca4be5de96270a8ae490fe2ba11b0e0 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: tcp/dccp: Consolidate common code for RFC 3390 conversion

This patch consolidates the code common to TCP and CCID-2:
 * TCP uses RFC 3390 in a packet-oriented manner (tcp_input.c) and
 * CCID-2 uses RFC 3390 in packet-oriented manner (RFC 4341).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid2.c |  8 ++------
 net/ipv4/tcp_input.c   | 17 ++---------------
 2 files changed, 4 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index c539f79ab8e..fa713227c66 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -596,12 +596,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
 	hctx->ssthresh = ~0U;
 
-	/*
-	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
-	 * packets for new connections, following the rules from [RFC3390]".
-	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
-	 */
-	hctx->cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+	/* Use larger initial windows (RFC 3390, rfc2581bis) */
+	hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
 	max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67ccce2a96b..16d0040de34 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -811,25 +811,12 @@ void tcp_update_metrics(struct sock *sk)
 	}
 }
 
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- *	The RFC specifies a window of no more than 4380 bytes
- *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
- *	is a bit misleading because they use a clamp at 4380 bytes
- *	rather than use a multiplier in the relevant range.
- */
 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-	if (!cwnd) {
-		if (tp->mss_cache > 1460)
-			cwnd = 2;
-		else
-			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
-	}
+	if (!cwnd)
+		cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-- 
cgit v1.2.3


From ddab05568eaa70fc92b2aae957136f188f724e9c Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Clean up slow-path input processing

This patch rearranges the order of statements of the slow-path input processing
(i.e. any other state than OPEN), to resolve the following issues.

 1. Dependencies: the order of statements now better matches RFC 4340, 8.5, i.e.
    step 7 is before step 9 (previously 9 was before 7), and parsing options in
    step 8 (which can consume resources) now comes after step 7.
 2. Bug-fix: in state CLOSED, there should not be any sequence number checking
    or option processing. This is why the test for CLOSED has been moved after
    the test for LISTEN.
 3. As before sequence number checks are omitted if in state LISTEN/REQUEST, due
    to the note underneath the table in RFC 4340, 7.5.3.
 4. Packets are now passed on to Ack Vector / CCID processing only after
    - step 7  (receive unexpected packets),
    - step 9  (receive Reset),
    - step 13 (receive CloseReq),
    - step 14 (receive Close)
    and only if the state is PARTOPEN. This simplifies CCID processing:
    - in LISTEN/CLOSED the CCIDs are non-existent;
    - in RESPOND/REQUEST the CCIDs have not yet been negotiated;
    - in CLOSEREQ and active-CLOSING the node has already closed this socket;
    - in passive-CLOSING the client is waiting for its Reset.
    In the last case, RFC 4340, 8.3 leaves it open to ignore further incoming
    data, which is the approach taken here.

As a result of (3), CCID processing is now indeed confined to OPEN/PARTOPEN
states, i.e. congestion control is performed only on the flow of data packets.

This avoids pathological cases of doing congestion control on those messages
which set up and terminate the connection.

I have done a few checks to see if this creates a problem in other parts of
the code. This seems not to be the case; even if there were one, it would be
better to fix it than to perform congestion control on Close/Request/Response
messages. Similarly for Ack Vectors (as they depend on the negotiated CCID).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/input.c | 68 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 9a108ce17fc..b1e38bf9445 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -603,22 +603,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		/* Caller (dccp_v4_do_rcv) will send Reset */
 		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
 		return 1;
+	} else if (sk->sk_state == DCCP_CLOSED) {
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+		return 1;
 	}
 
-	if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
-		if (dccp_check_seqno(sk, skb))
-			goto discard;
-
-		/*
-		 * Step 8: Process options and mark acknowledgeable
-		 */
-		if (dccp_parse_options(sk, NULL, skb))
-			return 1;
+	/* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
+	if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
+		goto discard;
 
-		dccp_handle_ackvec_processing(sk, skb);
-		dccp_deliver_input_to_ccids(sk, skb);
+	/*
+	 *   Step 7: Check for unexpected packet types
+	 *      If (S.is_server and P.type == Response)
+	 *	    or (S.is_client and P.type == Request)
+	 *	    or (S.state == RESPOND and P.type == Data),
+	 *	  Send Sync packet acknowledging P.seqno
+	 *	  Drop packet and return
+	 */
+	if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+	     dh->dccph_type == DCCP_PKT_RESPONSE) ||
+	    (dp->dccps_role == DCCP_ROLE_CLIENT &&
+	     dh->dccph_type == DCCP_PKT_REQUEST) ||
+	    (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
+		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+		goto discard;
 	}
 
+	/*  Step 8: Process options */
+	if (dccp_parse_options(sk, NULL, skb))
+		return 1;
+
 	/*
 	 *  Step 9: Process Reset
 	 *	If P.type == Reset,
@@ -626,41 +640,21 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	 *		S.state := TIMEWAIT
 	 *		Set TIMEWAIT timer
 	 *		Drop packet and return
-	*/
+	 */
 	if (dh->dccph_type == DCCP_PKT_RESET) {
 		dccp_rcv_reset(sk, skb);
 		return 0;
-		/*
-		 *   Step 7: Check for unexpected packet types
-		 *      If (S.is_server and P.type == Response)
-		 *	    or (S.is_client and P.type == Request)
-		 *	    or (S.state == RESPOND and P.type == Data),
-		 *	  Send Sync packet acknowledging P.seqno
-		 *	  Drop packet and return
-		 */
-	} else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
-		    dh->dccph_type == DCCP_PKT_RESPONSE) ||
-		    (dp->dccps_role == DCCP_ROLE_CLIENT &&
-		     dh->dccph_type == DCCP_PKT_REQUEST) ||
-		    (sk->sk_state == DCCP_RESPOND &&
-		     dh->dccph_type == DCCP_PKT_DATA)) {
-		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
-		goto discard;
-	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {	/* Step 13 */
 		if (dccp_rcv_closereq(sk, skb))
 			return 0;
 		goto discard;
-	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {		/* Step 14 */
 		if (dccp_rcv_close(sk, skb))
 			return 0;
 		goto discard;
 	}
 
 	switch (sk->sk_state) {
-	case DCCP_CLOSED:
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
-		return 1;
-
 	case DCCP_REQUESTING:
 		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
 		if (queued >= 0)
@@ -669,8 +663,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		__kfree_skb(skb);
 		return 0;
 
-	case DCCP_RESPOND:
 	case DCCP_PARTOPEN:
+		/* Step 8: if using Ack Vectors, mark packet acknowledgeable */
+		dccp_handle_ackvec_processing(sk, skb);
+		dccp_deliver_input_to_ccids(sk, skb);
+		/* fall through */
+	case DCCP_RESPOND:
 		queued = dccp_rcv_respond_partopen_state_process(sk, skb,
 								 dh, len);
 		break;
-- 
cgit v1.2.3


From d6da3511d6b558d0b017777b61dc08b8fbc06ea4 Mon Sep 17 00:00:00 2001
From: Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Policy-based packet dequeueing infrastructure

This patch adds a generic infrastructure for policy-based dequeueing of
TX packets and provides two policies:
 * a simple FIFO policy (which is the default) and
 * a priority based policy (set via socket options).
Both policies honour the tx_qlen sysctl for the maximum size of the write
queue (can be overridden via socket options).

The priority policy uses skb->priority internally to assign an u32 priority
identifier, using the same ranking as SO_PRIORITY. The skb->priority field
is set to 0 when the packet leaves DCCP. The priority is supplied as ancillary
data using cmsg(3), the patch also provides the requisite parsing routines.

Signed-off-by: Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/Makefile  |   2 +-
 net/dccp/dccp.h    |  12 +++++
 net/dccp/output.c  |   7 ++-
 net/dccp/proto.c   |  67 ++++++++++++++++++++++++++--
 net/dccp/qpolicy.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 206 insertions(+), 8 deletions(-)
 create mode 100644 net/dccp/qpolicy.c

(limited to 'net')

diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index b68440bd7fa..0c1c9af2bf7 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
 
 dccp-y := ccid.o feat.o input.o minisocks.o options.o \
-	  output.o proto.o timer.o ackvec.o
+	  qpolicy.o output.o proto.o timer.o ackvec.o
 
 dccp_ipv4-y := ipv4.o
 
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 74c90cd2767..ce2dd6f6f34 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -234,6 +234,18 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
 			   const enum dccp_pkt_type pkt_type);
 
+/*
+ * TX Packet Dequeueing Interface
+ */
+extern void		dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+extern bool		dccp_qpolicy_full(struct sock *sk);
+extern void		dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+extern struct sk_buff	*dccp_qpolicy_top(struct sock *sk);
+extern struct sk_buff	*dccp_qpolicy_pop(struct sock *sk);
+
+/*
+ * TX Packet Output and TX Timers
+ */
 extern void dccp_write_xmit(struct sock *sk);
 extern void dccp_write_space(struct sock *sk);
 extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index b1eaf7bcfb1..2532797a800 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -241,7 +241,7 @@ static void dccp_xmit_packet(struct sock *sk)
 {
 	int err, len;
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue);
+	struct sk_buff *skb = dccp_qpolicy_pop(sk);
 
 	if (unlikely(skb == NULL))
 		return;
@@ -344,7 +344,7 @@ void dccp_write_xmit(struct sock *sk)
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
 
-	while ((skb = skb_peek(&sk->sk_write_queue))) {
+	while ((skb = dccp_qpolicy_top(sk))) {
 		int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
 
 		switch (ccid_packet_dequeue_eval(rc)) {
@@ -358,8 +358,7 @@ void dccp_write_xmit(struct sock *sk)
 			dccp_xmit_packet(sk);
 			break;
 		case CCID_PACKET_ERR:
-			skb_dequeue(&sk->sk_write_queue);
-			kfree_skb(skb);
+			dccp_qpolicy_drop(sk, skb);
 			dccp_pr_debug("packet discarded due to err=%d\n", rc);
 		}
 	}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8c125ffab1c..b56efdd2a42 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -189,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	dp->dccps_rate_last	= jiffies;
 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
+	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
 
 	dccp_init_xmit_timers(sk);
 
@@ -541,6 +542,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_RECV_CSCOV:
 		err = dccp_setsockopt_cscov(sk, val, true);
 		break;
+	case DCCP_SOCKOPT_QPOLICY_ID:
+		if (sk->sk_state != DCCP_CLOSED)
+			err = -EISCONN;
+		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+			err = -EINVAL;
+		else
+			dp->dccps_qpolicy = val;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+		if (val < 0)
+			err = -EINVAL;
+		else
+			dp->dccps_tx_qlen = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -648,6 +663,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_RECV_CSCOV:
 		val = dp->dccps_pcrlen;
 		break;
+	case DCCP_SOCKOPT_QPOLICY_ID:
+		val = dp->dccps_qpolicy;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+		val = dp->dccps_tx_qlen;
+		break;
 	case 128 ... 191:
 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
 					     len, (u32 __user *)optval, optlen);
@@ -690,6 +711,43 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
 #endif
 
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+	/*
+	 * Assign an (opaque) qpolicy priority value to skb->priority.
+	 *
+	 * We are overloading this skb field for use with the qpolicy subystem.
+	 * The skb->priority is normally used for the SO_PRIORITY option, which
+	 * is initialised from sk_priority. Since the assignment of sk_priority
+	 * to skb->priority happens later (on layer 3), we overload this field
+	 * for use with queueing priorities as long as the skb is on layer 4.
+	 * The default priority value (if nothing is set) is 0.
+	 */
+	skb->priority = 0;
+
+	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_DCCP)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case DCCP_SCM_PRIORITY:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+				return -EINVAL;
+			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 size_t len)
 {
@@ -705,8 +763,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	lock_sock(sk);
 
-	if (sysctl_dccp_tx_qlen &&
-	    (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
+	if (dccp_qpolicy_full(sk)) {
 		rc = -EAGAIN;
 		goto out_release;
 	}
@@ -734,7 +791,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (rc != 0)
 		goto out_discard;
 
-	skb_queue_tail(&sk->sk_write_queue, skb);
+	rc = dccp_msghdr_parse(msg, skb);
+	if (rc != 0)
+		goto out_discard;
+
+	dccp_qpolicy_push(sk, skb);
 	dccp_write_xmit(sk);
 out_release:
 	release_sock(sk);
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 00000000000..414696b0d83
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,126 @@
+/*
+ *  net/dccp/qpolicy.c
+ *
+ *  Policy-based packet dequeueing interface for DCCP.
+ *
+ *  Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License v2
+ *  as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ *	Simple Dequeueing Policy:
+ *	If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+	skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_tx_qlen &&
+	       sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+	return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ *	Priority-based Dequeueing Policy:
+ *	If tx_qlen is different from 0 and the queue has reached its upper bound
+ *	of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+	struct sk_buff *skb, *best = NULL;
+
+	skb_queue_walk(&sk->sk_write_queue, skb)
+		if (best == NULL || skb->priority > best->priority)
+			best = skb;
+	return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+	struct sk_buff *skb, *worst = NULL;
+
+	skb_queue_walk(&sk->sk_write_queue, skb)
+		if (worst == NULL || skb->priority < worst->priority)
+			worst = skb;
+	return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+	if (qpolicy_simple_full(sk))
+		dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+	return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations  -  TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top:  peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+	void		(*push)	(struct sock *sk, struct sk_buff *skb);
+	bool		(*full) (struct sock *sk);
+	struct sk_buff*	(*top)  (struct sock *sk);
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+	[DCCPQ_POLICY_SIMPLE] = {
+		.push = qpolicy_simple_push,
+		.full = qpolicy_simple_full,
+		.top  = qpolicy_simple_top,
+	},
+	[DCCPQ_POLICY_PRIO] = {
+		.push = qpolicy_simple_push,
+		.full = qpolicy_prio_full,
+		.top  = qpolicy_prio_best_skb,
+	},
+};
+
+/*
+ *	Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+	qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+	return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb != NULL) {
+		skb_unlink(skb, &sk->sk_write_queue);
+		kfree_skb(skb);
+	}
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+	return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+	struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+	/* Clear any skb fields that we used internally */
+	skb->priority = 0;
+
+	if (skb)
+		skb_unlink(skb, &sk->sk_write_queue);
+	return skb;
+}
-- 
cgit v1.2.3


From 7d1af6a8d935678248d057564e75e1452409a53c Mon Sep 17 00:00:00 2001
From: Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp qpolicy: Parameter checking of cmsg qpolicy parameters

Ensure that cmsg->cmsg_type value is valid for qpolicy
that is currently in use.

Signed-off-by: Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/dccp.h    |  1 +
 net/dccp/proto.c   |  4 ++++
 net/dccp/qpolicy.c | 23 +++++++++++++++++------
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index ce2dd6f6f34..1585fa26488 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -242,6 +242,7 @@ extern bool		dccp_qpolicy_full(struct sock *sk);
 extern void		dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
 extern struct sk_buff	*dccp_qpolicy_top(struct sock *sk);
 extern struct sk_buff	*dccp_qpolicy_pop(struct sock *sk);
+extern bool		dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
 
 /*
  * TX Packet Output and TX Timers
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index b56efdd2a42..a3caa11aa83 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -735,6 +735,10 @@ static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
 		if (cmsg->cmsg_level != SOL_DCCP)
 			continue;
 
+		if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+		    !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+			return -EINVAL;
+
 		switch (cmsg->cmsg_type) {
 		case DCCP_SCM_PRIORITY:
 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
index 414696b0d83..27383f88c75 100644
--- a/net/dccp/qpolicy.c
+++ b/net/dccp/qpolicy.c
@@ -73,17 +73,20 @@ static struct dccp_qpolicy_operations {
 	void		(*push)	(struct sock *sk, struct sk_buff *skb);
 	bool		(*full) (struct sock *sk);
 	struct sk_buff*	(*top)  (struct sock *sk);
+	__be32		params;
 
 } qpol_table[DCCPQ_POLICY_MAX] = {
 	[DCCPQ_POLICY_SIMPLE] = {
-		.push = qpolicy_simple_push,
-		.full = qpolicy_simple_full,
-		.top  = qpolicy_simple_top,
+		.push   = qpolicy_simple_push,
+		.full   = qpolicy_simple_full,
+		.top    = qpolicy_simple_top,
+		.params = 0,
 	},
 	[DCCPQ_POLICY_PRIO] = {
-		.push = qpolicy_simple_push,
-		.full = qpolicy_prio_full,
-		.top  = qpolicy_prio_best_skb,
+		.push   = qpolicy_simple_push,
+		.full   = qpolicy_prio_full,
+		.top    = qpolicy_prio_best_skb,
+		.params = DCCP_SCM_PRIORITY,
 	},
 };
 
@@ -124,3 +127,11 @@ struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
 		skb_unlink(skb, &sk->sk_write_queue);
 	return skb;
 }
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+	/* check if exactly one bit is set */
+	if (!param || (param & (param - 1)))
+		return false;
+	return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}
-- 
cgit v1.2.3


From f76fd327a8b32d3ad5b51639faf6f54d18be0981 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Runtime verification of timer resolution

The DCCP base time resolution is 10 microseconds (RFC 4340, 13.1 ... 13.3).

Using a timer with a lower resolution was found to trigger the following
bug warnings/problems on high-speed networks (e.g. local loopback):
 * RTT samples are rounded down to 0 if below resolution;
 * in some cases, negative RTT samples were observed;
 * the CCID-3 feedback timer complains that the feedback interval is 0,
   since the feedback interval is in the order of 1 RTT or less and RTT
   measurement rounded this down to 0;
On an Intel computer this will for instance happen when using a
boot-time parameter of "clocksource=jiffies".

The following system log messages were observed:
  11:24:00 kernel: BUG: delta (0) <= 0 at ccid3_hc_rx_send_feedback()
  11:26:12 kernel: BUG: delta (0) <= 0 at ccid3_hc_rx_send_feedback()
  11:26:30 kernel: dccp_sample_rtt: unusable RTT sample 0, using min
  11:26:30 last message repeated 5 times

This patch defines a global constant for the time resolution, adds this in
timer.c, and checks the available clock resolution at CCID-3 module load time.

When the resolution is worse than 10 microseconds, module loading exits with
a message "socket type not supported".

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 13 +++++++++++++
 net/dccp/dccp.h        |  5 ++++-
 net/dccp/timer.c       |  3 +--
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 0d406f82e43..f566eb76aeb 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -869,6 +869,19 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
 
 static __init int ccid3_module_init(void)
 {
+	struct timespec tp;
+
+	/*
+	 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
+	 * correctly and feedback is sent either too early or too late.
+	 */
+	hrtimer_get_res(CLOCK_MONOTONIC, &tp);
+	if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
+		printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
+		       " resolution - check your clocksource.\n", __func__,
+		       tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
+		return -ESOCKTNOSUPPORT;
+	}
 	return ccid_register(&ccid3);
 }
 module_init(ccid3_module_init);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 1585fa26488..b63a82ccb2b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -86,10 +86,13 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define DCCP_RTO_MAX ((unsigned)(64 * HZ))
 
+/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */
+#define DCCP_TIME_RESOLUTION	10
+
 /*
  * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
  */
-#define DCCP_SANE_RTT_MIN	100
+#define DCCP_SANE_RTT_MIN	(10 * DCCP_TIME_RESOLUTION)
 #define DCCP_FALLBACK_RTT	(USEC_PER_SEC / 5)
 #define DCCP_SANE_RTT_MAX	(3 * USEC_PER_SEC)
 
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index e02d5a94f4c..16359e29e7f 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -281,8 +281,7 @@ u32 dccp_timestamp(void)
 {
 	s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
 
-	do_div(delta, 10);
-	return delta;
+	return div_u64(delta, DCCP_TIME_RESOLUTION);
 }
 EXPORT_SYMBOL_GPL(dccp_timestamp);
 
-- 
cgit v1.2.3


From d0c05fe4448db5cbdd886186860581f736f59ae9 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Simplified handling of TX states

Since CCIDs are only used during the established phase of a connection,
they have very little internal state; this specifically reduces to:

 * "no packet sent" if and only if s == 0, for the TX packet size s;

 * when the first packet has been sent (i.e. `s' > 0), the question is whether
   or not feedback has been received:
   - if a feedback packet is received, "feedback = yes" is set,
   - if the nofeedback timer expires,  "feedback = no"  is set.

Thus the CCID only needs to remember state about whether or not feedback
has been received. This is now implemented using a boolean flag, which is
toggled when a feedback packet arrives or the nofeedback timer expires.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 41 ++++++-----------------------------------
 net/dccp/ccids/ccid3.h | 11 ++---------
 2 files changed, 8 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f566eb76aeb..5470a978be0 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,31 +49,6 @@ static int ccid3_debug;
 /*
  *	Transmitter Half-Connection Routines
  */
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
-{
-	static char *ccid3_state_names[] = {
-	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
-	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
-	[TFRC_SSTATE_FBACK]    = "FBACK",
-	};
-
-	return ccid3_state_names[state];
-}
-#endif
-
-static void ccid3_hc_tx_set_state(struct sock *sk,
-				  enum ccid3_hc_tx_states state)
-{
-	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	enum ccid3_hc_tx_states oldstate = hctx->state;
-
-	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
-		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
-		       ccid3_tx_state_name(state));
-	WARN_ON(state == oldstate);
-	hctx->state = state;
-}
 
 /*
  * Compute the initial sending rate X_init in the manner of RFC 3390:
@@ -206,16 +181,15 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		goto restart_timer;
 	}
 
-	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
-		       ccid3_tx_state_name(hctx->state));
+	ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk,
+		       hctx->feedback ? "" : "out");
 
 	/* Ignore and do not restart after leaving the established state */
 	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
 		goto out;
 
 	/* Reset feedback state to "no feedback received" */
-	if (hctx->state == TFRC_SSTATE_FBACK)
-		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+	hctx->feedback = false;
 
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
@@ -290,7 +264,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	if (hctx->state == TFRC_SSTATE_NO_SENT) {
+	if (hctx->s == 0) {
 		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
 				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
 		hctx->last_win_count   = 0;
@@ -324,8 +298,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		}
 		ccid3_update_send_interval(hctx);
 
-		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-
 	} else {
 		delay = ktime_us_delta(hctx->t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
@@ -396,8 +368,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
 	 */
-	if (hctx->state == TFRC_SSTATE_NO_FBACK) {
-		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+	if (!hctx->feedback) {
+		hctx->feedback = true;
 
 		if (hctx->t_rto == 0) {
 			/*
@@ -502,7 +474,6 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
 
-	hctx->state = TFRC_SSTATE_NO_SENT;
 	hctx->hist  = NULL;
 	setup_timer(&hctx->no_feedback_timer,
 		    ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 7884159937a..1773a8dd36d 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -70,13 +70,6 @@ enum ccid3_options {
 	TFRC_OPT_RECEIVE_RATE	 = 194,
 };
 
-/* TFRC sender states */
-enum ccid3_hc_tx_states {
-	TFRC_SSTATE_NO_SENT = 1,
-	TFRC_SSTATE_NO_FBACK,
-	TFRC_SSTATE_FBACK,
-};
-
 /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
  *
  * @x - Current sending rate in 64 * bytes per second
@@ -87,7 +80,7 @@ enum ccid3_hc_tx_states {
  * @s - Packet size in bytes
  * @t_rto - Nofeedback Timer setting in usecs
  * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
- * @state - Sender state, one of %ccid3_hc_tx_states
+ * @feedback - Whether feedback has been received or not
  * @last_win_count - Last window counter sent
  * @t_last_win_count - Timestamp of earliest packet with
  *                     last_win_count value sent
@@ -105,7 +98,7 @@ struct ccid3_hc_tx_sock {
 	u32				t_rto;
 	u32				t_ipi;
 	u16				s;
-	enum ccid3_hc_tx_states		state:8;
+	bool				feedback:1;
 	u8				last_win_count;
 	ktime_t				t_last_win_count;
 	struct timer_list		no_feedback_timer;
-- 
cgit v1.2.3


From 8b67ad12b04ef7bdf5d2b4de24fe5a609b26cf12 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc: Suppress unavoidable "below resolution" warning

In the congestion-avoidance phase a decay of p towards 0 is natural once fewer
losses are encountered. Hence the warning message "p is below resolution" is
not necessary, and thus turned into a debug message by this patch.

The TFRC_SMALLEST_P is needed since in theory p never actually reaches 0. When
no further losses are encountered, the loss interval I_0 grows in length,
causing p to decrease towards 0, causing X_calc = s/(RTT * f(p)) to increase.

With the given minimum-resolution this congestion avoidance phase stops at some
fixed value, an approximation formula has been added to the documentation.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/lib/tfrc_equation.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index bc3dc2b2a84..38239c4d5e1 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -632,8 +632,16 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 
 	if (p <= TFRC_CALC_X_SPLIT) 		{     /* 0.0000 < p <= 0.05   */
 		if (p < TFRC_SMALLEST_P) {	      /* 0.0000 < p <  0.0001 */
-			DCCP_WARN("Value of p (%d) below resolution. "
-				  "Substituting %d\n", p, TFRC_SMALLEST_P);
+			/*
+			 * In the congestion-avoidance phase p decays towards 0
+			 * when there are no further losses, so this case is
+			 * natural. Truncating to p_min = 0.01% means that the
+			 * maximum achievable throughput is limited to about
+			 * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
+			 * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
+			 */
+			tfrc_pr_debug("Value of p (%d) below resolution. "
+				      "Substituting %d\n", p, TFRC_SMALLEST_P);
 			index = 0;
 		} else 				      /* 0.0001 <= p <= 0.05  */
 			index =  p/TFRC_SMALLEST_P - 1;
-- 
cgit v1.2.3


From 24b8d343215919c7a2ba18b9f89a0961e1459cad Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc: Receiver history initialisation routine

This patch
 1) separates history allocation and initialisation, to facilitate early
    loss detection (implemented by a subsequent patch);

 2) removes duplication by using the existing tfrc_rx_hist_purge() if the
    allocation fails. This is now possible, since the initialisation routine
 3) zeroes out the entire history before using it.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              |  2 +-
 net/dccp/ccids/lib/packet_history.c | 52 +++++++++++++++++++++----------------
 net/dccp/ccids/lib/packet_history.h |  2 +-
 3 files changed, 32 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 5470a978be0..36f4992f3c3 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -766,7 +766,7 @@ static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
 
 	hcrx->state = TFRC_RSTATE_NO_DATA;
 	tfrc_lh_init(&hcrx->li_hist);
-	return tfrc_rx_hist_alloc(&hcrx->hist);
+	return tfrc_rx_hist_init(&hcrx->hist, sk);
 }
 
 static void ccid3_hc_rx_exit(struct sock *sk)
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 5c445086690..5b4e1cf8439 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -352,28 +352,6 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
 
-int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
-{
-	int i;
-
-	for (i = 0; i <= TFRC_NDUPACK; i++) {
-		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
-		if (h->ring[i] == NULL)
-			goto out_free;
-	}
-
-	h->loss_count = h->loss_start = 0;
-	return 0;
-
-out_free:
-	while (i-- != 0) {
-		kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
-		h->ring[i] = NULL;
-	}
-	return -ENOBUFS;
-}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc);
-
 void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 {
 	int i;
@@ -386,6 +364,36 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
 
+static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
+{
+	int i;
+
+	memset(h, 0, sizeof(*h));
+
+	for (i = 0; i <= TFRC_NDUPACK; i++) {
+		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
+		if (h->ring[i] == NULL) {
+			tfrc_rx_hist_purge(h);
+			return -ENOBUFS;
+		}
+	}
+	return 0;
+}
+
+int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk)
+{
+	if (tfrc_rx_hist_alloc(h))
+		return -ENOBUFS;
+	/*
+	 * Initialise first entry with GSR to start loss detection as early as
+	 * possible. Code using this must not use any other fields. The entry
+	 * will be overwritten once the CCID updates its received packets.
+	 */
+	tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
+
 /**
  * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
  */
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 221d8102da5..e9d8097947d 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -153,7 +153,7 @@ extern int  tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 				struct sock *sk);
 extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
 				   const struct sk_buff *skb);
-extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
+extern int  tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
 extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
 
 #endif /* _DCCP_PKT_HIST_ */
-- 
cgit v1.2.3


From d20ed95f8bf3d98d31dbbab8b00bb4c1a4a140f3 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc: Perform early loss detection

This enables the TFRC code to begin loss detection (as soon as the module
is loaded), using the latest updates from rfc3448bis-06, 6.3.1:

 * when the first data packet(s) are lost or marked, set
 * X_target = s/(2*R) => f(p) = s/(R * X_target) = 2,
 * corresponding to a loss rate of ~ 20.64%.

The handle_loss() function is now called right at the begin of rx_packet_recv()
and thus no longer protected against duplicates: hence a call to rx_duplicate()
has been added.  Such a call makes sense now, as the previous patch initialises
the first entry with a sequence number of GSR.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              | 48 ++++++++++++++++++++++++++++++-------
 net/dccp/ccids/lib/packet_history.c |  3 +++
 2 files changed, 42 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 36f4992f3c3..0a7c22598d6 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -577,6 +577,28 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 		hcrx->p_inverse = ~0U;   /* see RFC 4342, 8.5 */
 		break;
 	case CCID3_FBACK_PARAM_CHANGE:
+		if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) {
+			/*
+			 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
+			 * FIXME: in rfc3448bis the receiver returns X_recv=0
+			 * here as it normally would in the first feedback packet.
+			 * However this is not possible yet, since the code still
+			 * uses RFC 3448, i.e.
+			 *    If (p > 0)
+			 *      Calculate X_calc using the TCP throughput equation.
+			 *      X = max(min(X_calc, 2*X_recv), s/t_mbi);
+			 * would bring X down to s/t_mbi. That is why we return
+			 * X_recv according to rfc3448bis-06 for the moment.
+			 */
+			u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT, s = hcrx->s;
+
+			if (s == 0) {
+				DCCP_WARN("No sample for s, using fallback\n");
+				s = TCP_MIN_RCVMSS;
+			}
+			hcrx->x_recv = scaled_div32(s, 2 * rtt);
+			break;
+		}
 		/*
 		 * When parameters change (new loss or p > p_prev), we do not
 		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
@@ -650,6 +672,14 @@ static u32 ccid3_first_li(struct sock *sk)
 	u32 x_recv, p, delta;
 	u64 fval;
 
+	/*
+	 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p
+	 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p
+	 * is about 20.64%. This yields an interval length of 4.84 (rounded up).
+	 */
+	if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA))
+		return 5;
+
 	if (hcrx->rtt == 0) {
 		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
 		hcrx->rtt = DCCP_FALLBACK_RTT;
@@ -683,6 +713,15 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
 	const bool is_data_packet = dccp_data_packet(skb);
 
+	/*
+	 * Perform loss detection and handle pending losses
+	 */
+	if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist,
+				skb, ndp, ccid3_first_li, sk)) {
+		do_feedback = CCID3_FBACK_PARAM_CHANGE;
+		goto done_receiving;
+	}
+
 	if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) {
 		if (is_data_packet) {
 			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
@@ -710,15 +749,6 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		hcrx->bytes_recv += payload;
 	}
 
-	/*
-	 * Perform loss detection and handle pending losses
-	 */
-	if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist,
-				skb, ndp, ccid3_first_li, sk)) {
-		do_feedback = CCID3_FBACK_PARAM_CHANGE;
-		goto done_receiving;
-	}
-
 	if (tfrc_rx_hist_loss_pending(&hcrx->hist))
 		return; /* done receiving */
 
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 5b4e1cf8439..8db34225c00 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -335,6 +335,9 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 {
 	int is_new_loss = 0;
 
+	if (tfrc_rx_hist_duplicate(h, skb))
+		return 0;
+
 	if (h->loss_count == 0) {
 		__do_track_loss(h, skb, ndp);
 	} else if (h->loss_count == 1) {
-- 
cgit v1.2.3


From 3ca7aea04152255bb65275b0018d3c673bc1f4e7 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc: Return type of update_i_mean is void

This changes the return type of tfrc_lh_update_i_mean() to void, since that
function returns always `false'. This is due to

 	len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;

 	if (len - (s64)cur->li_length <= 0)	/* duplicate or reordered */
		return 0;

which means that update_i_mean can only increase the length of the open loss
interval I_0, and hence the value of I_tot0 (RFC 3448, 5.4). Consequently the
test `i_mean < old_i_mean' at the end of the function always evaluates to false.

There is no known way by which a loss interval can suddenly become shorter,
therefore the return type of the function is changed to void. (That is, under
the given circumstances step (3) in RFC 3448, 6.1 will not occur.)

Further changes:
----------------
 * the function is now called from tfrc_rx_handle_loss, which is equivalent
   to the previous way of calling from rx_packet_recv (it was called whenever
   there was no new or pending loss, now  it is also updated when there is
   a pending loss - this increases the accuracy a bit);
 * added a FIXME to possibly consider NDP counting as per RFC 4342 (this is
   not implemented yet).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              |  8 --------
 net/dccp/ccids/lib/loss_interval.c  | 20 +++++++++++---------
 net/dccp/ccids/lib/loss_interval.h  |  2 +-
 net/dccp/ccids/lib/packet_history.c |  5 +++++
 4 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 0a7c22598d6..50dac01e48c 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -767,15 +767,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (sample != 0)
 			hcrx->rtt = tfrc_ewma(hcrx->rtt, sample, 9);
-
-	} else if (tfrc_lh_update_i_mean(&hcrx->li_hist, skb)) {
-		/*
-		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
-		 * has decreased (resp. p has increased), send feedback now.
-		 */
-		do_feedback = CCID3_FBACK_PARAM_CHANGE;
 	}
-
 	/*
 	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
 	 */
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 5b3ce0688c5..fe5c2a31c77 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -86,21 +86,26 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
 
 /**
  * tfrc_lh_update_i_mean  -  Update the `open' loss interval I_0
- * For recomputing p: returns `true' if p > p_prev  <=>  1/p < 1/p_prev
+ * This updates I_mean as the sequence numbers increase. As a consequence, the
+ * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
+ * decreases, and thus there is no need to send renewed feedback.
  */
-u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
+void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
 {
 	struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
-	u32 old_i_mean = lh->i_mean;
 	s64 len;
 
 	if (cur == NULL)			/* not initialised */
-		return 0;
+		return;
+
+	/* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
+	if (!dccp_data_packet(skb))
+		return;
 
 	len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
 
 	if (len - (s64)cur->li_length <= 0)	/* duplicate or reordered */
-		return 0;
+		return;
 
 	if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
 		/*
@@ -114,14 +119,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
 		cur->li_is_closed = 1;
 
 	if (tfrc_lh_length(lh) == 1)		/* due to RFC 3448, 6.3.1 */
-		return 0;
+		return;
 
 	cur->li_length = len;
 	tfrc_lh_calc_i_mean(lh);
-
-	return (lh->i_mean < old_i_mean);
 }
-EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
 
 /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
 static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 246018a3b26..f101ae2cf52 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -69,7 +69,7 @@ struct tfrc_rx_hist;
 
 extern int  tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
 				 u32 (*first_li)(struct sock *), struct sock *);
-extern u8   tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
 extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
 
 #endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 8db34225c00..8ea96903033 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -351,6 +351,11 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 		is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
 		__three_after_loss(h);
 	}
+
+	/* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
+	if (!is_new_loss)
+		tfrc_lh_update_i_mean(lh, skb);
+
 	return is_new_loss;
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
-- 
cgit v1.2.3


From 34a081be8e14b7ada70e069b65b05d54db4af497 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc: Let dccp_tfrc_lib do the sampling work

This migrates more TFRC-related code into the dccp_tfrc_lib:
 * sampling of the packet size `s' (which is only needed until the first
   loss interval is computed (ccid3_first_li));
 * updating the byte-counter `bytes_recvd' in between sending feedbacks.
The result is a better separation of CCID-3 specific and TFRC specific
code, which aids future integration with ECN and e.g. CCID-4.

Further changes:
----------------
 * replaced magic number of 536 with equivalent constant TCP_MIN_RCVMSS;
   (this constant is also used when no estimate for `s' is available).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              | 38 ++++++++-----------------------------
 net/dccp/ccids/ccid3.h              |  4 ----
 net/dccp/ccids/lib/packet_history.c | 10 ++++++++++
 net/dccp/ccids/lib/packet_history.h | 16 ++++++++++++++++
 net/dccp/proto.c                    |  2 +-
 5 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 50dac01e48c..8744590acb3 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -590,12 +590,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 			 * would bring X down to s/t_mbi. That is why we return
 			 * X_recv according to rfc3448bis-06 for the moment.
 			 */
-			u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT, s = hcrx->s;
+			u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT,
+			    s	= tfrc_rx_hist_packet_size(&hcrx->hist);
 
-			if (s == 0) {
-				DCCP_WARN("No sample for s, using fallback\n");
-				s = TCP_MIN_RCVMSS;
-			}
 			hcrx->x_recv = scaled_div32(s, 2 * rtt);
 			break;
 		}
@@ -617,7 +614,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 		if (delta <= 0)
 			DCCP_BUG("delta (%ld) <= 0", (long)delta);
 		else
-			hcrx->x_recv = scaled_div32(hcrx->bytes_recv, delta);
+			hcrx->x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta);
 		break;
 	default:
 		return;
@@ -628,7 +625,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 
 	hcrx->tstamp_last_feedback = now;
 	hcrx->last_counter	   = dccp_hdr(skb)->dccph_ccval;
-	hcrx->bytes_recv	   = 0;
+	hcrx->hist.bytes_recvd	   = 0;
 
 	dp->dccps_hc_rx_insert_options = 1;
 	dccp_send_ack(sk);
@@ -669,7 +666,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 static u32 ccid3_first_li(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	u32 x_recv, p, delta;
+	u32 x_recv, p, delta,
+	    s = tfrc_rx_hist_packet_size(&hcrx->hist);
 	u64 fval;
 
 	/*
@@ -686,7 +684,7 @@ static u32 ccid3_first_li(struct sock *sk)
 	}
 
 	delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback));
-	x_recv = scaled_div32(hcrx->bytes_recv, delta);
+	x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta);
 	if (x_recv == 0) {		/* would also trigger divide-by-zero */
 		DCCP_WARN("X_recv==0\n");
 		if (hcrx->x_recv == 0) {
@@ -696,8 +694,7 @@ static u32 ccid3_first_li(struct sock *sk)
 		x_recv = hcrx->x_recv;
 	}
 
-	fval = scaled_div(hcrx->s, hcrx->rtt);
-	fval = scaled_div32(fval, x_recv);
+	fval = scaled_div32(scaled_div(s, hcrx->rtt), x_recv);
 	p = tfrc_calc_x_reverse_lookup(fval);
 
 	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
@@ -724,31 +721,12 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 	if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) {
 		if (is_data_packet) {
-			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
 			do_feedback = CCID3_FBACK_INITIAL;
 			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
-			hcrx->s = payload;
-			/*
-			 * Not necessary to update bytes_recv here,
-			 * since X_recv = 0 for the first feedback packet (cf.
-			 * RFC 3448, 6.3) -- gerrit
-			 */
 		}
 		goto update_records;
 	}
 
-	if (tfrc_rx_hist_duplicate(&hcrx->hist, skb))
-		return; /* done receiving */
-
-	if (is_data_packet) {
-		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
-		/*
-		 * Update moving-average of s and the sum of received payload bytes
-		 */
-		hcrx->s = tfrc_ewma(hcrx->s, payload, 9);
-		hcrx->bytes_recv += payload;
-	}
-
 	if (tfrc_rx_hist_loss_pending(&hcrx->hist))
 		return; /* done receiving */
 
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 1773a8dd36d..0c4fadd85f9 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -124,25 +124,21 @@ enum ccid3_hc_rx_states {
  *
  *  @last_counter  -  Tracks window counter (RFC 4342, 8.1)
  *  @state  -  Receiver state, one of %ccid3_hc_rx_states
- *  @bytes_recv  -  Total sum of DCCP payload bytes
  *  @x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
  *  @rtt  -  Receiver estimate of RTT
  *  @tstamp_last_feedback  -  Time at which last feedback was sent
  *  @hist  -  Packet history (loss detection + RTT sampling)
  *  @li_hist  -  Loss Interval database
- *  @s  -  Received packet size in bytes
  *  @p_inverse  -  Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
  */
 struct ccid3_hc_rx_sock {
 	u8				last_counter:4;
 	enum ccid3_hc_rx_states		state:8;
-	u32				bytes_recv;
 	u32				x_recv;
 	u32				rtt;
 	ktime_t				tstamp_last_feedback;
 	struct tfrc_rx_hist		hist;
 	struct tfrc_loss_hist		li_hist;
-	u16				s;
 #define p_inverse			li_hist.i_mean
 };
 
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 8ea96903033..ee34b456424 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -352,6 +352,16 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 		__three_after_loss(h);
 	}
 
+	/*
+	 * Update moving-average of `s' and the sum of received payload bytes.
+	 */
+	if (dccp_data_packet(skb)) {
+		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+
+		h->packet_size = tfrc_ewma(h->packet_size, payload, 9);
+		h->bytes_recvd += payload;
+	}
+
 	/* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
 	if (!is_new_loss)
 		tfrc_lh_update_i_mean(lh, skb);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index e9d8097947d..b7c87a1a272 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -91,12 +91,16 @@ struct tfrc_rx_hist_entry {
  * @loss_count:		Number of entries in circular history
  * @loss_start:		Movable index (for loss detection)
  * @rtt_sample_prev:	Used during RTT sampling, points to candidate entry
+ * @packet_size:	Packet size in bytes (as per RFC 3448, 3.1)
+ * @bytes_recvd:	Number of bytes received since last sending feedback
  */
 struct tfrc_rx_hist {
 	struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
 	u8			  loss_count:2,
 				  loss_start:2;
 #define rtt_sample_prev		  loss_start
+	u32			  packet_size,
+				  bytes_recvd;
 };
 
 /**
@@ -140,6 +144,18 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
 	return h->loss_count > 0;
 }
 
+/*
+ * Accessor functions to retrieve parameters sampled by the RX history
+ */
+static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
+{
+	if (h->packet_size == 0) {
+		DCCP_WARN("No sample for s, using fallback\n");
+		return TCP_MIN_RCVMSS;
+	}
+	return h->packet_size;
+}
+
 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
 				    const struct sk_buff *skb, const u64 ndp);
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index a3caa11aa83..ecf3be961e1 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -185,7 +185,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	sk->sk_state		= DCCP_CLOSED;
 	sk->sk_write_space	= dccp_write_space;
 	icsk->icsk_sync_mss	= dccp_sync_mss;
-	dp->dccps_mss_cache	= 536;
+	dp->dccps_mss_cache	= TCP_MIN_RCVMSS;
 	dp->dccps_rate_last	= jiffies;
 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
-- 
cgit v1.2.3


From 2f3e3bbad917c426d3aba03a535809e5699de156 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Remove duplicate RX states

The only state information that the CCID-3 receiver keeps is whether initial
feedback has been sent or not. Further, this overlaps with use of feedback:

 * state == TFRC_RSTATE_NO_DATA as long as no feedback has been sent;
 * state == TFRC_RSTATE_DATA    as soon as the first feedback has been sent.

This patch reduces the duplication, by memorising the type of the last feedback.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 47 +++++------------------------------------------
 net/dccp/ccids/ccid3.h | 14 ++++++++------
 2 files changed, 13 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 8744590acb3..04b183548aa 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -528,40 +528,6 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 /*
  *	Receiver Half-Connection Routines
  */
-
-/* CCID3 feedback types */
-enum ccid3_fback_type {
-	CCID3_FBACK_NONE = 0,
-	CCID3_FBACK_INITIAL,
-	CCID3_FBACK_PERIODIC,
-	CCID3_FBACK_PARAM_CHANGE
-};
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
-{
-	static char *ccid3_rx_state_names[] = {
-	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
-	[TFRC_RSTATE_DATA]    = "DATA",
-	};
-
-	return ccid3_rx_state_names[state];
-}
-#endif
-
-static void ccid3_hc_rx_set_state(struct sock *sk,
-				  enum ccid3_hc_rx_states state)
-{
-	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	enum ccid3_hc_rx_states oldstate = hcrx->state;
-
-	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
-		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
-		       ccid3_rx_state_name(state));
-	WARN_ON(state == oldstate);
-	hcrx->state = state;
-}
-
 static void ccid3_hc_rx_send_feedback(struct sock *sk,
 				      const struct sk_buff *skb,
 				      enum ccid3_fback_type fbtype)
@@ -577,7 +543,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 		hcrx->p_inverse = ~0U;   /* see RFC 4342, 8.5 */
 		break;
 	case CCID3_FBACK_PARAM_CHANGE:
-		if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) {
+		if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
 			/*
 			 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
 			 * FIXME: in rfc3448bis the receiver returns X_recv=0
@@ -626,6 +592,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 	hcrx->tstamp_last_feedback = now;
 	hcrx->last_counter	   = dccp_hdr(skb)->dccph_ccval;
 	hcrx->hist.bytes_recvd	   = 0;
+	hcrx->feedback		   = fbtype;
 
 	dp->dccps_hc_rx_insert_options = 1;
 	dccp_send_ack(sk);
@@ -675,7 +642,7 @@ static u32 ccid3_first_li(struct sock *sk)
 	 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p
 	 * is about 20.64%. This yields an interval length of 4.84 (rounded up).
 	 */
-	if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA))
+	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
 		return 5;
 
 	if (hcrx->rtt == 0) {
@@ -719,11 +686,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		goto done_receiving;
 	}
 
-	if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) {
-		if (is_data_packet) {
+	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
+		if (is_data_packet)
 			do_feedback = CCID3_FBACK_INITIAL;
-			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
-		}
 		goto update_records;
 	}
 
@@ -764,7 +729,6 @@ static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
 
-	hcrx->state = TFRC_RSTATE_NO_DATA;
 	tfrc_lh_init(&hcrx->li_hist);
 	return tfrc_rx_hist_init(&hcrx->hist, sk);
 }
@@ -779,7 +743,6 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
-	info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->state;
 	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
 	info->tcpi_rcv_rtt  = ccid3_hc_rx_sk(sk)->rtt;
 }
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 0c4fadd85f9..72e110a1100 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -114,16 +114,18 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
     return hctx;
 }
 
-/* TFRC receiver states */
-enum ccid3_hc_rx_states {
-	TFRC_RSTATE_NO_DATA = 1,
-	TFRC_RSTATE_DATA,
+
+enum ccid3_fback_type {
+	CCID3_FBACK_NONE = 0,
+	CCID3_FBACK_INITIAL,
+	CCID3_FBACK_PERIODIC,
+	CCID3_FBACK_PARAM_CHANGE
 };
 
 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
  *
  *  @last_counter  -  Tracks window counter (RFC 4342, 8.1)
- *  @state  -  Receiver state, one of %ccid3_hc_rx_states
+ *  @feedback  -  The type of the feedback last sent
  *  @x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
  *  @rtt  -  Receiver estimate of RTT
  *  @tstamp_last_feedback  -  Time at which last feedback was sent
@@ -133,7 +135,7 @@ enum ccid3_hc_rx_states {
  */
 struct ccid3_hc_rx_sock {
 	u8				last_counter:4;
-	enum ccid3_hc_rx_states		state:8;
+	enum ccid3_fback_type		feedback:4;
 	u32				x_recv;
 	u32				rtt;
 	ktime_t				tstamp_last_feedback;
-- 
cgit v1.2.3


From 2b81143aa3505e2460b24b357996c2f21840ea58 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Always perform receiver RTT sampling

This updates the CCID-3 receiver in part with regard to errata 610 and 611
(http://www.rfc-editor.org/errata_list.php), which change RFC 4342 to use the
Receive Rate as specified in rfc3448bis, requiring to constantly sample the
RTT (or use a sender RTT).

Doing this requires reusing the RX history structure after dealing with a loss.

The patch does not resolve how to compute X_recv if the interval is less
than 1 RTT. A FIXME has been added (and is resolved in subsequent patch).

Furthermore, since this is all TFRC-based functionality, the RTT estimation
is now also performed by the dccp_tfrc_lib module. This further simplifies
the CCID-3 code.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              | 43 +++++++++-----------------
 net/dccp/ccids/ccid3.h              |  2 --
 net/dccp/ccids/lib/packet_history.c | 60 +++++++++++++++++++++++++++----------
 net/dccp/ccids/lib/packet_history.h | 17 +++++++++--
 4 files changed, 73 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 04b183548aa..8e64d9665a2 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -556,8 +556,8 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 			 * would bring X down to s/t_mbi. That is why we return
 			 * X_recv according to rfc3448bis-06 for the moment.
 			 */
-			u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT,
-			    s	= tfrc_rx_hist_packet_size(&hcrx->hist);
+			u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
+			    rtt = tfrc_rx_hist_rtt(&hcrx->hist);
 
 			hcrx->x_recv = scaled_div32(s, 2 * rtt);
 			break;
@@ -576,6 +576,11 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 			break;
 		/* fall through */
 	case CCID3_FBACK_PERIODIC:
+		/*
+		 * FIXME: check if delta is less than or equal to 1 RTT using
+		 * the receiver RTT sample. This is described in Errata 610/611
+		 * of RFC 4342 which reference section 6.2 of RFC 3448.
+		 */
 		delta = ktime_us_delta(now, hcrx->tstamp_last_feedback);
 		if (delta <= 0)
 			DCCP_BUG("delta (%ld) <= 0", (long)delta);
@@ -633,8 +638,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 static u32 ccid3_first_li(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	u32 x_recv, p, delta,
-	    s = tfrc_rx_hist_packet_size(&hcrx->hist);
+	u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
+	    rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p, delta;
 	u64 fval;
 
 	/*
@@ -645,11 +650,6 @@ static u32 ccid3_first_li(struct sock *sk)
 	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
 		return 5;
 
-	if (hcrx->rtt == 0) {
-		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
-		hcrx->rtt = DCCP_FALLBACK_RTT;
-	}
-
 	delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback));
 	x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta);
 	if (x_recv == 0) {		/* would also trigger divide-by-zero */
@@ -661,7 +661,7 @@ static u32 ccid3_first_li(struct sock *sk)
 		x_recv = hcrx->x_recv;
 	}
 
-	fval = scaled_div32(scaled_div(s, hcrx->rtt), x_recv);
+	fval = scaled_div32(scaled_div(s, rtt), x_recv);
 	p = tfrc_calc_x_reverse_lookup(fval);
 
 	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
@@ -695,26 +695,11 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	if (tfrc_rx_hist_loss_pending(&hcrx->hist))
 		return; /* done receiving */
 
-	/*
-	 * Handle data packets: RTT sampling and monitoring p
-	 */
-	if (unlikely(!is_data_packet))
-		goto update_records;
-
-	if (!tfrc_lh_is_initialised(&hcrx->li_hist)) {
-		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->hist, skb);
-		/*
-		 * Empty loss history: no loss so far, hence p stays 0.
-		 * Sample RTT values, since an RTT estimate is required for the
-		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
-		 */
-		if (sample != 0)
-			hcrx->rtt = tfrc_ewma(hcrx->rtt, sample, 9);
-	}
 	/*
 	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
 	 */
-	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
+	if (is_data_packet &&
+	    SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
 		do_feedback = CCID3_FBACK_PERIODIC;
 
 update_records:
@@ -744,7 +729,7 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
 	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
-	info->tcpi_rcv_rtt  = ccid3_hc_rx_sk(sk)->rtt;
+	info->tcpi_rcv_rtt  = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist);
 }
 
 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
@@ -759,7 +744,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
 		if (len < sizeof(rx_info))
 			return -EINVAL;
 		rx_info.tfrcrx_x_recv = hcrx->x_recv;
-		rx_info.tfrcrx_rtt    = hcrx->rtt;
+		rx_info.tfrcrx_rtt    = tfrc_rx_hist_rtt(&hcrx->hist);
 		rx_info.tfrcrx_p      = tfrc_invert_loss_event_rate(hcrx->p_inverse);
 		len = sizeof(rx_info);
 		val = &rx_info;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 72e110a1100..342235c57bf 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -127,7 +127,6 @@ enum ccid3_fback_type {
  *  @last_counter  -  Tracks window counter (RFC 4342, 8.1)
  *  @feedback  -  The type of the feedback last sent
  *  @x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
- *  @rtt  -  Receiver estimate of RTT
  *  @tstamp_last_feedback  -  Time at which last feedback was sent
  *  @hist  -  Packet history (loss detection + RTT sampling)
  *  @li_hist  -  Loss Interval database
@@ -137,7 +136,6 @@ struct ccid3_hc_rx_sock {
 	u8				last_counter:4;
 	enum ccid3_fback_type		feedback:4;
 	u32				x_recv;
-	u32				rtt;
 	ktime_t				tstamp_last_feedback;
 	struct tfrc_rx_hist		hist;
 	struct tfrc_loss_hist		li_hist;
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index ee34b456424..e2e250aa5c8 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -151,14 +151,31 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
 
+
+static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
+{
+	struct tfrc_rx_hist_entry *tmp = h->ring[a];
+
+	h->ring[a] = h->ring[b];
+	h->ring[b] = tmp;
+}
+
 static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
 {
-	const u8 idx_a = tfrc_rx_hist_index(h, a),
-		 idx_b = tfrc_rx_hist_index(h, b);
-	struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
+	__tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a),
+			       tfrc_rx_hist_index(h, b));
+}
 
-	h->ring[idx_a] = h->ring[idx_b];
-	h->ring[idx_b] = tmp;
+/**
+ * tfrc_rx_hist_resume_rtt_sampling  -  Prepare RX history for RTT sampling
+ * This is called after loss detection has finished, when the history entry
+ * with the index of `loss_count' holds the highest-received sequence number.
+ * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt).
+ */
+static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h)
+{
+	__tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count));
+	h->loss_count = h->loss_start = 0;
 }
 
 /*
@@ -200,8 +217,7 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
 
 		if (dccp_loss_free(s2, s1, n1)) {
 			/* hole is filled: S0, S2, and S1 are consecutive */
-			h->loss_count = 0;
-			h->loss_start = tfrc_rx_hist_index(h, 1);
+			tfrc_rx_hist_resume_rtt_sampling(h);
 		} else
 			/* gap between S2 and S1: just update loss_prev */
 			tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
@@ -254,8 +270,7 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
 
 			if (dccp_loss_free(s1, s2, n2)) {
 				/* entire hole filled by S0, S3, S1, S2 */
-				h->loss_start = tfrc_rx_hist_index(h, 2);
-				h->loss_count = 0;
+				tfrc_rx_hist_resume_rtt_sampling(h);
 			} else {
 				/* gap remains between S1 and S2 */
 				h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -299,8 +314,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
 
 		if (dccp_loss_free(s2, s3, n3)) {
 			/* no gap between S2 and S3: entire hole is filled */
-			h->loss_start = tfrc_rx_hist_index(h, 3);
-			h->loss_count = 0;
+			tfrc_rx_hist_resume_rtt_sampling(h);
 		} else {
 			/* gap between S2 and S3 */
 			h->loss_start = tfrc_rx_hist_index(h, 2);
@@ -340,6 +354,7 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 
 	if (h->loss_count == 0) {
 		__do_track_loss(h, skb, ndp);
+		tfrc_rx_hist_sample_rtt(h, skb);
 	} else if (h->loss_count == 1) {
 		__one_after_loss(h, skb, ndp);
 	} else if (h->loss_count != 2) {
@@ -435,11 +450,24 @@ static inline struct tfrc_rx_hist_entry *
  * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
  * to compute a sample with given data - calling function should check this.
  */
-u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
+void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
 {
-	u32 sample = 0,
-	    delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
-			    tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+	u32 sample = 0, delta_v;
+
+	/*
+	 * When not to sample:
+	 * - on non-data packets
+	 *   (RFC 4342, 8.1: CCVal only fully defined for data packets);
+	 * - when no data packets have been received yet
+	 *   (FIXME: using sampled packet size as indicator here);
+	 * - as long as there are gaps in the sequence space (pending loss).
+	 */
+	if (!dccp_data_packet(skb) || h->packet_size == 0 ||
+	    tfrc_rx_hist_loss_pending(h))
+		return;
+
+	delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
+			tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
 
 	if (delta_v < 1 || delta_v > 4) {	/* unsuitable CCVal delta */
 		if (h->rtt_sample_prev == 2) {	/* previous candidate stored */
@@ -479,6 +507,6 @@ u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
 	h->rtt_sample_prev = 0;	       /* use current entry as next reference */
 keep_ref_for_next_time:
 
-	return sample;
+	h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 9);
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index b7c87a1a272..ba5832bbc34 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -91,6 +91,7 @@ struct tfrc_rx_hist_entry {
  * @loss_count:		Number of entries in circular history
  * @loss_start:		Movable index (for loss detection)
  * @rtt_sample_prev:	Used during RTT sampling, points to candidate entry
+ * @rtt_estimate:	Receiver RTT estimate
  * @packet_size:	Packet size in bytes (as per RFC 3448, 3.1)
  * @bytes_recvd:	Number of bytes received since last sending feedback
  */
@@ -98,7 +99,10 @@ struct tfrc_rx_hist {
 	struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
 	u8			  loss_count:2,
 				  loss_start:2;
+	/* Receiver RTT sampling */
 #define rtt_sample_prev		  loss_start
+	u32			  rtt_estimate;
+	/* Receiver sampling of application payload lengths */
 	u32			  packet_size,
 				  bytes_recvd;
 };
@@ -154,6 +158,15 @@ static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
 		return TCP_MIN_RCVMSS;
 	}
 	return h->packet_size;
+
+}
+static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
+{
+	if (h->rtt_estimate == 0) {
+		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
+		return  DCCP_FALLBACK_RTT;
+	}
+	return h->rtt_estimate;
 }
 
 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
@@ -167,8 +180,8 @@ extern int  tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 				struct sk_buff *skb, const u64 ndp,
 				u32 (*first_li)(struct sock *sk),
 				struct sock *sk);
-extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
-				   const struct sk_buff *skb);
+extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
+				    const struct sk_buff *skb);
 extern int  tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
 extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
 
-- 
cgit v1.2.3


From 49ffc29a0223adbe0ea7005eea3ab2a03abbeb06 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp: Clamping RTT values

This extracts the clamping part of dccp_sample_rtt() and makes it available
to other parts of the code (as e.g. used in the next patch).

Note: The function dccp_sample_rtt() now reduces to subtracting the elapsed
time. This could be eliminated but would require shorter prefixes and thus
is not done by this patch - maybe an idea for later.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/dccp.h  |  9 ++++++++-
 net/dccp/input.c | 11 +----------
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index b63a82ccb2b..5281190aa19 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -334,7 +334,14 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
 extern int	   dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
 extern void	   dccp_send_close(struct sock *sk, const int active);
 extern int	   dccp_invalid_packet(struct sk_buff *skb);
-extern u32	   dccp_sample_rtt(struct sock *sk, long delta);
+
+static inline u32  dccp_sane_rtt(long usec_sample)
+{
+	if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX))
+		DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample);
+	return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX);
+}
+extern u32 dccp_sample_rtt(struct sock *sk, long delta);
 
 static inline int dccp_bad_service_code(const struct sock *sk,
 					const __be32 service)
diff --git a/net/dccp/input.c b/net/dccp/input.c
index b1e38bf9445..df0e6714aa1 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -707,16 +707,7 @@ u32 dccp_sample_rtt(struct sock *sk, long delta)
 	/* dccpor_elapsed_time is either zeroed out or set and > 0 */
 	delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
 
-	if (unlikely(delta <= 0)) {
-		DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
-		return DCCP_SANE_RTT_MIN;
-	}
-	if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
-		DCCP_WARN("RTT sample %ld too large, using max\n", delta);
-		return DCCP_SANE_RTT_MAX;
-	}
-
-	return delta;
+	return dccp_sane_rtt(delta);
 }
 
 EXPORT_SYMBOL_GPL(dccp_sample_rtt);
-- 
cgit v1.2.3


From 22338f09bd60434a3f1d6608f0fa55972067985f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp tfrc: Increase number of RTT samples

This improves the receiver RTT sampling algorithm so that it tries harder to get
as many RTT samples as possible.

The algorithm is based the concepts presented in RFC 4340, 8.1, using timestamps
and the CCVal window counter. There exist 4 cases for the CCVal difference:
 * == 0: less than RTT/4 passed since last packet -- unusable;
 *  > 4: (much) more than 1 RTT has passed since last packet -- also unusable;
 * == 4: perfect sample (exactly one RTT has passed since last packet);
 * 1..3: sub-optimal sample (between RTT/4 and 3*RTT/4 has passed).

In the last case the algorithm tried to optimise by storing away the candidate
and then re-trying next time. The problem is that
 * a large number of samples is needed to smooth out the inaccuracies of the
   algorithm;
 * the sender may not be sending enough packets to warrant a "next time";
 * hence it is better to use suboptimal samples whenever possible.
The algorithm now stores away the current sample only if the difference is 0.

Applicability and background
----------------------------
A realistic example is MP3 streaming where packets are sent at a rate of less
than one packet per RTT, which means that suitable samples are absent for a
very long time.

The effectiveness of using suboptimal samples (with a delta between 1 and 4) was
confirmed by instrumenting the algorithm with counters. The results of two 20
second test runs were:
 * With the old algorithm and a total of 38442 function calls, only 394 of these
   calls resulted in usable RTT samples (about 1%), and 378 out of these were
   "perfect" samples and 28013 (unused) samples had a delta of 1..3.
 * With the new algorithm and a total of 37057 function calls, 1702 usable RTT
   samples were retrieved (about 4.6%), 5 out of these were "perfect" samples.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/lib/packet_history.c | 83 +++++++++++--------------------------
 1 file changed, 24 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index e2e250aa5c8..5c4ded1cf42 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -427,32 +427,17 @@ int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
 
-/**
- * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
- */
-static inline struct tfrc_rx_hist_entry *
-			tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
-{
-	return h->ring[0];
-}
-
-/**
- * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
- */
-static inline struct tfrc_rx_hist_entry *
-			tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
-{
-	return h->ring[h->rtt_sample_prev];
-}
-
 /**
  * tfrc_rx_hist_sample_rtt  -  Sample RTT from timestamp / CCVal
- * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
- * to compute a sample with given data - calling function should check this.
+ * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss
+ * is pending and uses the following history entries (via rtt_sample_prev):
+ * - h->ring[0]  contains the most recent history entry prior to @skb;
+ * - h->ring[1]  is an unused `dummy' entry when the current difference is 0;
  */
 void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
 {
-	u32 sample = 0, delta_v;
+	struct tfrc_rx_hist_entry *last = h->ring[0];
+	u32 sample, delta_v;
 
 	/*
 	 * When not to sample:
@@ -466,47 +451,27 @@ void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
 	    tfrc_rx_hist_loss_pending(h))
 		return;
 
-	delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
-			tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
-
-	if (delta_v < 1 || delta_v > 4) {	/* unsuitable CCVal delta */
-		if (h->rtt_sample_prev == 2) {	/* previous candidate stored */
-			sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
-				       tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
-			if (sample)
-				sample = 4 / sample *
-				         ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
-							tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
-			else    /*
-				 * FIXME: This condition is in principle not
-				 * possible but occurs when CCID is used for
-				 * two-way data traffic. I have tried to trace
-				 * it, but the cause does not seem to be here.
-				 */
-				DCCP_BUG("please report to dccp@vger.kernel.org"
-					 " => prev = %u, last = %u",
-					 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
-					 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
-		} else if (delta_v < 1) {
-			h->rtt_sample_prev = 1;
-			goto keep_ref_for_next_time;
-		}
-
-	} else if (delta_v == 4) /* optimal match */
-		sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
-	else {			 /* suboptimal match */
-		h->rtt_sample_prev = 2;
-		goto keep_ref_for_next_time;
-	}
+	h->rtt_sample_prev = 0;		/* reset previous candidate */
 
-	if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
-		DCCP_WARN("RTT sample %u too large, using max\n", sample);
-		sample = DCCP_SANE_RTT_MAX;
+	delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval);
+	if (delta_v == 0) {		/* less than RTT/4 difference */
+		h->rtt_sample_prev = 1;
+		return;
 	}
+	sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp)));
 
-	h->rtt_sample_prev = 0;	       /* use current entry as next reference */
-keep_ref_for_next_time:
+	if (delta_v <= 4)		/* between RTT/4 and RTT */
+		sample *= 4 / delta_v;
+	else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2))
+		/*
+		* Optimisation: CCVal difference is greater than 1 RTT, yet the
+		* sample is less than the local RTT estimate; which means that
+		* the RTT estimate is too high.
+		* To avoid noise, it is not done if the sample is below RTT/2.
+		*/
+		return;
 
-	h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 9);
+	/* Use a lower weight than usual to increase responsiveness */
+	h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5);
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
-- 
cgit v1.2.3


From 68c89ee53571a441799c03d5e240c6441bced620 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Update the computation of X_recv

This updates the computation of X_recv with regard to Errata 610/611 for
RFC 4342 and draft rfc3448bis-06, ensuring that at least an interval of 1
RTT is used to compute X_recv.  The change is wrapped into a new function
ccid3_hc_rx_x_recv().

Further changes:
----------------
 * feedback is not sent when no data packets arrived (bytes_recv == 0), as per
   rfc3448bis-06, 6.2;
 * take the timestamp for the feedback /after/ dccp_send_ack() returns, to avoid
   taking the transmission time into account (in case layer-2 is busy);
 * clearer handling of failure in ccid3_first_li().

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              | 64 ++++++++++++++-----------------------
 net/dccp/ccids/lib/packet_history.c | 30 +++++++++++++++++
 net/dccp/ccids/lib/packet_history.h | 13 +++++++-
 3 files changed, 66 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 8e64d9665a2..f2f9514dbad 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -533,9 +533,6 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 				      enum ccid3_fback_type fbtype)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	struct dccp_sock *dp = dccp_sk(sk);
-	ktime_t now = ktime_get_real();
-	s64 delta = 0;
 
 	switch (fbtype) {
 	case CCID3_FBACK_INITIAL:
@@ -565,42 +562,33 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 		/*
 		 * When parameters change (new loss or p > p_prev), we do not
 		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
-		 * need to  reuse the previous value of X_recv. However, when
-		 * X_recv was 0 (due to early loss), this would kill X down to
-		 * s/t_mbi (i.e. one packet in 64 seconds).
-		 * To avoid such drastic reduction, we approximate X_recv as
-		 * the number of bytes since last feedback.
-		 * This is a safe fallback, since X is bounded above by X_calc.
+		 * always check whether at least RTT time units were covered.
 		 */
-		if (hcrx->x_recv > 0)
-			break;
-		/* fall through */
+		hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
+		break;
 	case CCID3_FBACK_PERIODIC:
 		/*
-		 * FIXME: check if delta is less than or equal to 1 RTT using
-		 * the receiver RTT sample. This is described in Errata 610/611
-		 * of RFC 4342 which reference section 6.2 of RFC 3448.
+		 * Step (2) of rfc3448bis-06, 6.2:
+		 * - if no data packets have been received, just restart timer
+		 * - if data packets have been received, re-compute X_recv
 		 */
-		delta = ktime_us_delta(now, hcrx->tstamp_last_feedback);
-		if (delta <= 0)
-			DCCP_BUG("delta (%ld) <= 0", (long)delta);
-		else
-			hcrx->x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta);
+		if (hcrx->hist.bytes_recvd == 0)
+			goto prepare_for_next_time;
+		hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
 		break;
 	default:
 		return;
 	}
 
-	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n",
-		       (long)delta, hcrx->x_recv, hcrx->p_inverse);
+	ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse);
 
-	hcrx->tstamp_last_feedback = now;
-	hcrx->last_counter	   = dccp_hdr(skb)->dccph_ccval;
-	hcrx->hist.bytes_recvd	   = 0;
-	hcrx->feedback		   = fbtype;
-
-	dp->dccps_hc_rx_insert_options = 1;
+	dccp_sk(sk)->dccps_hc_rx_insert_options = 1;
 	dccp_send_ack(sk);
+
+prepare_for_next_time:
+	tfrc_rx_hist_restart_byte_counter(&hcrx->hist);
+	hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
+	hcrx->feedback	   = fbtype;
 }
 
 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
@@ -639,7 +627,7 @@ static u32 ccid3_first_li(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 	u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
-	    rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p, delta;
+	    rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
 	u64 fval;
 
 	/*
@@ -650,16 +638,9 @@ static u32 ccid3_first_li(struct sock *sk)
 	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
 		return 5;
 
-	delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback));
-	x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta);
-	if (x_recv == 0) {		/* would also trigger divide-by-zero */
-		DCCP_WARN("X_recv==0\n");
-		if (hcrx->x_recv == 0) {
-			DCCP_BUG("stored value of X_recv is zero");
-			return ~0U;
-		}
-		x_recv = hcrx->x_recv;
-	}
+	x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
+	if (x_recv == 0)
+		goto failed;
 
 	fval = scaled_div32(scaled_div(s, rtt), x_recv);
 	p = tfrc_calc_x_reverse_lookup(fval);
@@ -667,7 +648,10 @@ static u32 ccid3_first_li(struct sock *sk)
 	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
 		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
 
-	return p == 0 ? ~0U : scaled_div(1, p);
+	if (p > 0)
+		return scaled_div(1, p);
+failed:
+	return UINT_MAX;
 }
 
 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 5c4ded1cf42..547ad098ea6 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -385,6 +385,36 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
 
+/* Compute the sending rate X_recv measured between feedback intervals */
+u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
+{
+	u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate;
+	s64 delta = ktime_to_us(net_timedelta(h->bytes_start));
+
+	WARN_ON(delta <= 0);
+	/*
+	 * Ensure that the sampling interval for X_recv is at least one RTT,
+	 * by extending the sampling interval backwards in time, over the last
+	 * R_(m-1) seconds, as per rfc3448bis-06, 6.2.
+	 * To reduce noise (e.g. when the RTT changes often), this is only
+	 * done when delta is smaller than RTT/2.
+	 */
+	if (last_x_recv > 0 && delta < last_rtt/2) {
+		tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n",
+			      (long)delta, (unsigned)last_rtt);
+
+		delta = (bytes ? delta : 0) + last_rtt;
+		bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC);
+	}
+
+	if (unlikely(bytes == 0)) {
+		DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv);
+		return last_x_recv;
+	}
+	return scaled_div32(bytes, delta);
+}
+EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv);
+
 void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 {
 	int i;
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index ba5832bbc34..6552be63cb0 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -93,7 +93,8 @@ struct tfrc_rx_hist_entry {
  * @rtt_sample_prev:	Used during RTT sampling, points to candidate entry
  * @rtt_estimate:	Receiver RTT estimate
  * @packet_size:	Packet size in bytes (as per RFC 3448, 3.1)
- * @bytes_recvd:	Number of bytes received since last sending feedback
+ * @bytes_recvd:	Number of bytes received since @bytes_start
+ * @bytes_start:	Start time for counting @bytes_recvd
  */
 struct tfrc_rx_hist {
 	struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
@@ -105,6 +106,7 @@ struct tfrc_rx_hist {
 	/* Receiver sampling of application payload lengths */
 	u32			  packet_size,
 				  bytes_recvd;
+	ktime_t			  bytes_start;
 };
 
 /**
@@ -169,6 +171,15 @@ static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
 	return h->rtt_estimate;
 }
 
+static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
+{
+	h->bytes_recvd = 0;
+	h->bytes_start = ktime_get_real();
+}
+
+extern u32  tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
+
+
 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
 				    const struct sk_buff *skb, const u64 ndp);
 
-- 
cgit v1.2.3


From 88e97a93342c0b9e835d510921e7b2df8547d1bd Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Update the RX history records in one place

This patch is a requirement for enabling ECN support later on. With that change
in mind, the following preparations are done:
 * renamed handle_loss() into congestion_event() since it returns true when a
   congestion event happens (it will eventually also take care of ECN packets);
 * lets tfrc_rx_congestion_event() always update the RX history records, since
   this routine needs to be called for each non-duplicate packet anyway;
 * made all involved boolean-type functions to have return type `bool';

Updating the RX history records is now only necessary for the packets received
up to sending the first feedback. The receiver code becomes again simpler.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c              | 37 +++++++++++--------------------------
 net/dccp/ccids/lib/loss_interval.c  | 10 +++++-----
 net/dccp/ccids/lib/loss_interval.h  |  2 +-
 net/dccp/ccids/lib/packet_history.c | 37 ++++++++++++++++++-------------------
 net/dccp/ccids/lib/packet_history.h | 10 +++++-----
 5 files changed, 40 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f2f9514dbad..aca072b3eda 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -657,41 +657,26 @@ failed:
 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
 	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
 	const bool is_data_packet = dccp_data_packet(skb);
 
 	/*
 	 * Perform loss detection and handle pending losses
 	 */
-	if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist,
-				skb, ndp, ccid3_first_li, sk)) {
-		do_feedback = CCID3_FBACK_PARAM_CHANGE;
-		goto done_receiving;
-	}
-
-	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
-		if (is_data_packet)
-			do_feedback = CCID3_FBACK_INITIAL;
-		goto update_records;
-	}
-
-	if (tfrc_rx_hist_loss_pending(&hcrx->hist))
-		return; /* done receiving */
-
+	if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist,
+				     skb, ndp, ccid3_first_li, sk))
+		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE);
+	/*
+	 * Feedback for first non-empty data packet (RFC 3448, 6.3)
+	 */
+	else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet))
+		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL);
 	/*
 	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
 	 */
-	if (is_data_packet &&
-	    SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
-		do_feedback = CCID3_FBACK_PERIODIC;
-
-update_records:
-	tfrc_rx_hist_add_packet(&hcrx->hist, skb, ndp);
-
-done_receiving:
-	if (do_feedback)
-		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
+	else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet &&
+		 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
+		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC);
 }
 
 static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index fe5c2a31c77..b1ae8f8259e 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -140,18 +140,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
  * @sk:		   Used by @calc_first_li in caller-specific way (subtyping)
  * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
  */
-int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
-			 u32 (*calc_first_li)(struct sock *), struct sock *sk)
+bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
+			  u32 (*calc_first_li)(struct sock *), struct sock *sk)
 {
 	struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
 
 	if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
-		return 0;
+		return false;
 
 	new = tfrc_lh_demand_next(lh);
 	if (unlikely(new == NULL)) {
 		DCCP_CRIT("Cannot allocate/add loss record.");
-		return 0;
+		return false;
 	}
 
 	new->li_seqno	  = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
@@ -169,7 +169,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
 
 		tfrc_lh_calc_i_mean(lh);
 	}
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
 
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index f101ae2cf52..d08a226db43 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -67,7 +67,7 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
 
 struct tfrc_rx_hist;
 
-extern int  tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
 				 u32 (*first_li)(struct sock *), struct sock *);
 extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
 extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 547ad098ea6..cce9f03bda3 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -192,10 +192,8 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
 	u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
 	    s1 = DCCP_SKB_CB(skb)->dccpd_seq;
 
-	if (!dccp_loss_free(s0, s1, n1)) {	/* gap between S0 and S1 */
+	if (!dccp_loss_free(s0, s1, n1))	/* gap between S0 and S1 */
 		h->loss_count = 1;
-		tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
-	}
 }
 
 static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
@@ -328,13 +326,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
 }
 
 /**
- *  tfrc_rx_handle_loss  -  Loss detection and further processing
- *  @h:		    The non-empty RX history object
- *  @lh:	    Loss Intervals database to update
- *  @skb:	    Currently received packet
- *  @ndp:	    The NDP count belonging to @skb
- *  @calc_first_li: Caller-dependent computation of first loss interval in @lh
- *  @sk:	    Used by @calc_first_li (see tfrc_lh_interval_add)
+ *  tfrc_rx_congestion_event  -  Loss detection and further processing
+ *  @h:		The non-empty RX history object
+ *  @lh:	Loss Intervals database to update
+ *  @skb:	Currently received packet
+ *  @ndp:	The NDP count belonging to @skb
+ *  @first_li:	Caller-dependent computation of first loss interval in @lh
+ *  @sk:	Used by @calc_first_li (see tfrc_lh_interval_add)
  *  Chooses action according to pending loss, updates LI database when a new
  *  loss was detected, and does required post-processing. Returns 1 when caller
  *  should send feedback, 0 otherwise.
@@ -342,12 +340,12 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
  *  records accordingly, the caller should not perform any more RX history
  *  operations when loss_count is greater than 0 after calling this function.
  */
-int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
-			struct tfrc_loss_hist *lh,
-			struct sk_buff *skb, const u64 ndp,
-			u32 (*calc_first_li)(struct sock *), struct sock *sk)
+bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
+			      struct tfrc_loss_hist *lh,
+			      struct sk_buff *skb, const u64 ndp,
+			      u32 (*first_li)(struct sock *), struct sock *sk)
 {
-	int is_new_loss = 0;
+	bool new_event = false;
 
 	if (tfrc_rx_hist_duplicate(h, skb))
 		return 0;
@@ -355,6 +353,7 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 	if (h->loss_count == 0) {
 		__do_track_loss(h, skb, ndp);
 		tfrc_rx_hist_sample_rtt(h, skb);
+		tfrc_rx_hist_add_packet(h, skb, ndp);
 	} else if (h->loss_count == 1) {
 		__one_after_loss(h, skb, ndp);
 	} else if (h->loss_count != 2) {
@@ -363,7 +362,7 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 		/*
 		 * Update Loss Interval database and recycle RX records
 		 */
-		is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
+		new_event = tfrc_lh_interval_add(lh, h, first_li, sk);
 		__three_after_loss(h);
 	}
 
@@ -378,12 +377,12 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
 	}
 
 	/* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
-	if (!is_new_loss)
+	if (!new_event)
 		tfrc_lh_update_i_mean(lh, skb);
 
-	return is_new_loss;
+	return new_event;
 }
-EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
+EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event);
 
 /* Compute the sending rate X_recv measured between feedback intervals */
 u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 6552be63cb0..555e65cd73a 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -186,11 +186,11 @@ extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
 extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
 
 struct tfrc_loss_hist;
-extern int  tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
-				struct tfrc_loss_hist *lh,
-				struct sk_buff *skb, const u64 ndp,
-				u32 (*first_li)(struct sock *sk),
-				struct sock *sk);
+extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
+				     struct tfrc_loss_hist *lh,
+				     struct sk_buff *skb, const u64 ndp,
+				     u32 (*first_li)(struct sock *sk),
+				     struct sock *sk);
 extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
 				    const struct sk_buff *skb);
 extern int  tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
-- 
cgit v1.2.3


From 9d497a2c9120e31ff417e75f9f5576c4cde11281 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Implement rfc3448bis change to initial-rate computation

The patch updates CCID-3 with regard to the latest rfc3448bis-06:
 * in the first revisions of the draft, MSS was used for the RFC 3390 window;
 * then (from revision #1 to revision #2), it used the packet size `s';
 * now, in this revision (and apparently final), the value is back to MSS.

This change has an implication for the case when no RTT sample is available,
at the time of sending the first packet:

 * with RTT sample, 2*MSS/RTT <= initial_rate <= 4*MSS/RTT;
 * without RTT sample, the initial rate is one packet (s bytes) per second
   (sec. 4.2), but using s instead of MSS here creates an imbalance, since
   this would further reduce the initial sending rate.

Hence the patch uses MSS (called MPS in RFC 4340) in all places.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index aca072b3eda..d654264c510 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -53,18 +53,16 @@ static int ccid3_debug;
 /*
  * Compute the initial sending rate X_init in the manner of RFC 3390:
  *
- *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
+ *	X_init  =  min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT
  *
- * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
- * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
  * For consistency with other parts of the code, X_init is scaled by 2^6.
  */
 static inline u64 rfc3390_initial_rate(struct sock *sk)
 {
-	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	const __u32 w_init = clamp_t(__u32, 4380U, 2 * hctx->s, 4 * hctx->s);
+	const u32 mps = dccp_sk(sk)->dccps_mss_cache,
+	       w_init = clamp(4380U, 2 * mps, 4 * mps);
 
-	return scaled_div(w_init << 6, hctx->rtt);
+	return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt);
 }
 
 /**
@@ -293,7 +291,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
 			 */
 			hctx->rtt = DCCP_FALLBACK_RTT;
-			hctx->x   = hctx->s;
+			hctx->x	  = dp->dccps_mss_cache;
 			hctx->x <<= 6;
 		}
 		ccid3_update_send_interval(hctx);
-- 
cgit v1.2.3


From 891e4d8a402427bc40dee4c8413213a584710372 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Tidy up CCID-Kconfig dependencies

The per-CCID menu has several dependencies on EXPERIMENTAL. These are redundant,
since net/dccp/ccids/Kconfig is sourced by net/dccp/Kconfig and since the
latter menu in turn asserts a dependency on EXPERIMENTAL.

The patch removes the redundant dependencies as well as the repeated reference
within the sub-menu.

Further changes:
----------------
Two single dependencies on CCID-3 are replaced with a single enclosing `if'.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/Kconfig | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 44c7e90248f..dc973abe03b 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,8 +1,7 @@
 menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
 
 config IP_DCCP_CCID2
-	tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
+	tristate "CCID2 (TCP-Like)"
 	def_tristate IP_DCCP
 	---help---
 	  CCID 2, TCP-like Congestion Control, denotes Additive Increase,
@@ -35,7 +34,7 @@ config IP_DCCP_CCID2_DEBUG
 	    If in doubt, say N.
 
 config IP_DCCP_CCID3
-	tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
+	tristate "CCID3 (TCP-Friendly)"
 	def_tristate IP_DCCP
 	select IP_DCCP_TFRC_LIB
 	---help---
@@ -63,9 +62,9 @@ config IP_DCCP_CCID3
 
 	  If in doubt, say M.
 
+if IP_DCCP_CCID3
 config IP_DCCP_CCID3_DEBUG
 	  bool "CCID3 debugging messages"
-	  depends on IP_DCCP_CCID3
 	  ---help---
 	    Enable CCID3-specific debugging messages.
 
@@ -78,7 +77,6 @@ config IP_DCCP_CCID3_DEBUG
 config IP_DCCP_CCID3_RTO
 	  int "Use higher bound for nofeedback timer"
 	  default 100
-	  depends on IP_DCCP_CCID3 && EXPERIMENTAL
 	  ---help---
 	    Use higher lower bound for nofeedback timer expiration.
 
@@ -105,6 +103,7 @@ config IP_DCCP_CCID3_RTO
 	    The purpose of the nofeedback timer is to slow DCCP down when there
 	    is serious network congestion: experimenting with larger values should
 	    therefore not be performed on WANs.
+endif	# IP_DCCP_CCID3
 
 config IP_DCCP_TFRC_LIB
 	tristate
-- 
cgit v1.2.3


From c8f41d50adc380bfb38538ce39ca0ffea5926221 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Measuring the packet size s with regard to rfc3448bis-06

rfc3448bis allows three different ways of tracking the packet size `s':

 1. using the MSS/MPS (at initialisation, 4.2, and in 4.1 (1));
 2. using the average of `s' (in 4.1);
 3. using the maximum of `s' (in 4.2).

Instead of hard-coding a single interpretation of rfc3448bis, this implements
a choice of all three alternatives and suggests the first as default, since it
is the option which is most consistent with other parts of the specification.

The patch further deprecates the update of t_ipi whenever `s' changes. The
gains of doing this are only small since a change of s takes effect at the
next instant X is updated:
 * when the next feedback comes in (within one RTT or less);
 * when the nofeedback timer expires (within at most 4 RTTs).

Further, there are complications caused by updating t_ipi whenever s changes:
 * if t_ipi had previously been updated to effect oscillation prevention (4.5),
   then it is impossible to make the same adjustment to t_ipi again, thus
   counter-acting the algorithm;
 * s may be updated any time and a modification of t_ipi depends on the current
   state (e.g. no oscillation prevention is done in the absence of feedback);
 * in rev-06 of rfc3448bis, there are more possible cases, depending on whether
   the sender is in slow-start (t_ipi <= R/W_init), or in congestion-avoidance,
   limited by X_recv or the throughput equation (t_ipi <= t_mbi).

Thus there are side effects of always updating t_ipi as s changes. These may not
be desirable. The only case I can think of where such an update makes sense is
to recompute X_calc when p > 0 and when s changes (not done by this patch).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/Kconfig | 20 ++++++++++++++++++++
 net/dccp/ccids/ccid3.c | 27 +++++++++++++++------------
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index dc973abe03b..fb168be2cb4 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -74,6 +74,26 @@ config IP_DCCP_CCID3_DEBUG
 
 	    If in doubt, say N.
 
+choice
+	  prompt "Select method for measuring the packet size s"
+	  default IP_DCCP_CCID3_MEASURE_S_AS_MPS
+
+config IP_DCCP_CCID3_MEASURE_S_AS_MPS
+	  bool "Always use MPS in place of s"
+	  ---help---
+	    This use is recommended as it is consistent with the initialisation
+	    of X and suggested when s varies (rfc3448bis, (1) in section 4.1).
+config IP_DCCP_CCID3_MEASURE_S_AS_AVG
+	  bool "Use moving average"
+	  ---help---
+	    An alternative way of tracking s, also supported by rfc3448bis.
+	    This used to be the default for CCID-3 in previous kernels.
+config IP_DCCP_CCID3_MEASURE_S_AS_MAX
+	  bool "Track the maximum payload length"
+	  ---help---
+	    An experimental method based on tracking the maximum packet size.
+endchoice
+
 config IP_DCCP_CCID3_RTO
 	  int "Use higher bound for nofeedback timer"
 	  default 100
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index d654264c510..d77d3e664b7 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -136,17 +136,18 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 }
 
 /*
- *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
- *	@len: DCCP packet payload size in bytes
+ * ccid3_hc_tx_measure_packet_size  -  Measuring the packet size `s' (sec 4.1)
+ * @new_len: DCCP payload size in bytes (not used by all methods)
  */
-static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
+static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
 {
-	const u16 old_s = hctx->s;
-
-	hctx->s = tfrc_ewma(hctx->s, len, 9);
-
-	if (hctx->s != old_s)
-		ccid3_update_send_interval(hctx);
+#if   defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
+	return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9);
+#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX)
+	return max(ccid3_hc_tx_sk(sk)->s, new_len);
+#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS	*/
+	return dccp_sk(sk)->dccps_mss_cache;
+#endif
 }
 
 /*
@@ -271,8 +272,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		/* Set t_0 for initial packet */
 		hctx->t_nom = now;
 
-		hctx->s = skb->len;
-
 		/*
 		 * Use initial RTT sample when available: recommended by erratum
 		 * to RFC 4342. This implements the initialisation procedure of
@@ -294,6 +293,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			hctx->x	  = dp->dccps_mss_cache;
 			hctx->x <<= 6;
 		}
+
+		/* Compute t_ipi = s / X */
+		hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
 		ccid3_update_send_interval(hctx);
 
 	} else {
@@ -326,7 +328,8 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
-	ccid3_hc_tx_update_s(hctx, len);
+	/* Changes to s will become effective the next time X is computed */
+	hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);
 
 	if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss))
 		DCCP_CRIT("packet history - out of memory!");
-- 
cgit v1.2.3


From 53ac9570c8145710aaed9e1eb850c2e991a4ebc1 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Simplify computing and range-checking of t_ipi

This patch simplifies the computation of t_ipi, avoiding expensive computations
to enforce the minimum sending rate.

Both RFC 3448 and rfc3448bis (revision #06), as well as RFC 4342 sec 5., require
at various stages that at least one packet must be sent per t_mbi = 64 seconds.
This requires frequent divisions of the type X_min = s/t_mbi, which are later
converted back into an inter-packet-interval t_ipi_max = s/X_min = t_mbi.

The patch removes the expensive indirection; in the unlikely case of having
a sending rate less than one packet per 64 seconds, it also re-adjusts X.

The following cases document conformance with RFC 3448  / rfc3448bis-06:
 1) Time until receiving the first feedback packet:
   * if the sender has no initial RTT sample then X = s/1 Bps > s/t_mbi;
   * if the sender has an initial RTT sample or when the first feedback
     packet is received, X = W_init/R > s/t_mbi.

 2) Slow-start (p == 0 and feedback packets come in):
   * RFC 3448  (current code) enforces a minimum of s/R > s/t_mbi;
   * rfc3448bis (future code) enforces an even higher minimum of W_init/R.

 3) Congestion avoidance with no absence of feedback (p > 0):
   * when X_calc or X_recv/2 are too low, the minimum of X_min = s/t_mbi
     is enforced in update_x() when calling update_send_interval();
   * update_send_interval() is, as before, only called when X changes
     (i.e. either when increasing or decreasing, not when in equilibrium).

 4) Reduction of X without prior feedback or during slow-start (p==0):
   * both RFC 3448 and rfc3448bis here halve X directly;
   * the associated constraint X >= s/t_mbi is nforced here by send_interval().

 5) Reduction of X when p > 0:
   * X is modified indirectly via X_recv (RFC 3448) or X_recv_set (rfc3448bis);
   * in both cases, control goes back to section 4.3 (in both documents);
   * since p > 0, both documents use X = max(min(...), s/t_mbi), which is
     enforced in this patch by calling send_interval() from update_x().

I think that this analysis is exhaustive. Should I have forgotten a case,
the worst-case consideration arises when X sinks below s/t_mbi, and is then
increased back up to this minimum value. Even under this assumption, the
behaviour is correct, since all lower limits of X in RFC 3448 / rfc3448bis
are either equal to or greater than s/t_mbi.

Note on the condition X >= s/t_mbi  <==> t_ipi = s/X <= t_mbi: since X is
scaled by 64, and all time units are in microseconds, the coded condition is:

    t_ipi = s * 64 * 10^6 usec / X <= 64 * 10^6 usec

This simplifies to s / X <= 1 second <==> X * 1 second >= s > 0.
(A zero `s' is not allowed by the CCID-3 code).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index d77d3e664b7..7cd76c6c790 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -66,15 +66,15 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
 }
 
 /**
- * ccid3_update_send_interval  -  Calculate new t_ipi = s / X_inst
- * This respects the granularity of X_inst (64 * bytes/second).
+ * ccid3_update_send_interval  -  Calculate new t_ipi = s / X
+ * This respects the granularity of X (64 * bytes/second) and enforces the
+ * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
 {
+	if (unlikely(hctx->x <= hctx->s))
+		hctx->x	= hctx->s;
 	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
-
-	ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hctx->t_ipi,
-		       hctx->s, (unsigned)(hctx->x >> 6));
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
@@ -115,7 +115,6 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	if (hctx->p > 0) {
 
 		hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);
-		hctx->x = max(hctx->x, (((u64)hctx->s) << 6) / TFRC_T_MBI);
 
 	} else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {
 
@@ -197,8 +196,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 	if (hctx->t_rto == 0 || hctx->p == 0) {
 
 		/* halve send rate directly */
-		hctx->x = max(hctx->x / 2, (((u64)hctx->s) << 6) / TFRC_T_MBI);
+		hctx->x /= 2;
 		ccid3_update_send_interval(hctx);
+
 	} else {
 		/*
 		 *  Modify the cached value of X_recv
@@ -213,9 +213,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		BUG_ON(hctx->p && !hctx->x_calc);
 
 		if (hctx->x_calc > (hctx->x_recv >> 5))
-			hctx->x_recv =
-				max(hctx->x_recv / 2,
-				    (((__u64)hctx->s) << 6) / (2 * TFRC_T_MBI));
+			hctx->x_recv /= 2;
 		else {
 			hctx->x_recv = hctx->x_calc;
 			hctx->x_recv <<= 4;
-- 
cgit v1.2.3


From a3cbdde8e9c38b66b4f13ac5d6ff1939ded0ff20 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Thu, 4 Sep 2008 07:30:19 +0200
Subject: dccp ccid-3: Preventing Oscillations

This implements [RFC 3448, 4.5], which performs congestion avoidance behaviour
by reducing the transmit rate as the queueing delay (measured in terms of
long-term RTT) increases.

Oscillation can be turned on/off via a module option (do_osc_prev) and via sysfs
(using mode 0644), the default is off.

Overflow analysis:
------------------
 * oscillation prevention is done after update_x(), so that t_ipi <= 64000;
 * hence the multiplication "t_ipi * sqrt(R_sample)" needs 64 bits;
 * done using u64 for sqrt_sample and explicit typecast of t_ipi;
 * the divisor, R_sqmean, is non-zero because oscillation prevention is first
   called when receiving the second feedback packet, and tfrc_scaled_rtt() > 0.

A detailed discussion of the algorithm (with plots) is on
http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid3/sender_notes/oscillation_prevention/

The algorithm has negative side effects:
  * when allowing to decrease t_ipi (leads to a large RTT) and
  * when using it during slow-start;
both uses are therefore disabled.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 net/dccp/ccids/ccid3.h    |  6 ++++--
 net/dccp/ccids/lib/tfrc.h | 15 +++++++++++++++
 3 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 7cd76c6c790..06cfdad84a6 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,6 +49,8 @@ static int ccid3_debug;
 /*
  *	Transmitter Half-Connection Routines
  */
+/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */
+static int do_osc_prev = true;
 
 /*
  * Compute the initial sending rate X_init in the manner of RFC 3390:
@@ -296,6 +298,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
 		ccid3_update_send_interval(hctx);
 
+		/* Seed value for Oscillation Prevention (sec. 4.5) */
+		hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt);
+
 	} else {
 		delay = ktime_us_delta(hctx->t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
@@ -400,6 +405,38 @@ done_computing_x:
 			       hctx->s, hctx->p, hctx->x_calc,
 			       (unsigned)(hctx->x_recv >> 6),
 			       (unsigned)(hctx->x >> 6));
+	/*
+	 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to
+	 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
+	 * can be useful if few connections share a link, avoiding that buffer
+	 * fill levels (RTT) oscillate as a result of frequent adjustments to X.
+	 * A useful presentation with background information is in
+	 *    Joerg Widmer, "Equation-Based Congestion Control",
+	 *    MSc Thesis, University of Mannheim, Germany, 2000
+	 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
+	 */
+	if (do_osc_prev) {
+		r_sample = tfrc_scaled_sqrt(r_sample);
+		/*
+		 * The modulation can work in both ways: increase/decrease t_ipi
+		 * according to long-term increases/decreases of the RTT. The
+		 * former is a useful measure, since it works against queue
+		 * build-up. The latter temporarily increases the sending rate,
+		 * so that buffers fill up more quickly. This in turn causes
+		 * the RTT to increase, so that either later reduction becomes
+		 * necessary or the RTT stays at a very high level. Decreasing
+		 * t_ipi is therefore not supported.
+		 * Furthermore, during the initial slow-start phase the RTT
+		 * naturally increases, where using the algorithm would cause
+		 * delays. Hence it is disabled during the initial slow-start.
+		 */
+		if (r_sample > hctx->r_sqmean && hctx->p > 0)
+			hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
+					      hctx->r_sqmean);
+		hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
+		/* update R_sqmean _after_ computing the modulation factor */
+		hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
+	}
 
 	/* unschedule no feedback timer */
 	sk_stop_timer(sk, &hctx->no_feedback_timer);
@@ -749,6 +786,9 @@ static struct ccid_operations ccid3 = {
 	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
 };
 
+module_param(do_osc_prev, bool, 0644);
+MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
+
 #ifdef CONFIG_IP_DCCP_CCID3_DEBUG
 module_param(ccid3_debug, bool, 0644);
 MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 342235c57bf..af6e1bf937d 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,8 +47,8 @@
 /* Two seconds as per RFC 3448 4.2 */
 #define TFRC_INITIAL_TIMEOUT	   (2 * USEC_PER_SEC)
 
-/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
-#define TFRC_T_MBI		   64
+/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */
+#define TFRC_T_MBI		   (64 * USEC_PER_SEC)
 
 /*
  * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are
@@ -76,6 +76,7 @@ enum ccid3_options {
  * @x_recv - Receive rate    in 64 * bytes per second
  * @x_calc - Calculated rate in bytes per second
  * @rtt - Estimate of current round trip time in usecs
+ * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5)
  * @p - Current loss event rate (0-1) scaled by 1000000
  * @s - Packet size in bytes
  * @t_rto - Nofeedback Timer setting in usecs
@@ -94,6 +95,7 @@ struct ccid3_hc_tx_sock {
 	u64				x_recv;
 	u32				x_calc;
 	u32				rtt;
+	u16				r_sqmean;
 	u32				p;
 	u32				t_rto;
 	u32				t_ipi;
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index bb47146ac7d..ede12f53de5 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -47,6 +47,21 @@ static inline u32 scaled_div32(u64 a, u64 b)
 	return result;
 }
 
+/**
+ * tfrc_scaled_sqrt  -  Compute scaled integer sqrt(x) for 0 < x < 2^22-1
+ * Uses scaling to improve accuracy of the integer approximation of sqrt(). The
+ * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
+ * clamped RTT samples (dccp_sample_rtt).
+ * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the
+ * scaling factor is neutralised. For this purpose, it avoids returning zero.
+ */
+static inline u16 tfrc_scaled_sqrt(const u32 sample)
+{
+	const unsigned long non_zero_sample = sample ? : 1;
+
+	return int_sqrt(non_zero_sample << 10);
+}
+
 /**
  * tfrc_ewma  -  Exponentially weighted moving average
  * @weight: Weight to be used as damping factor, in units of 1/10
-- 
cgit v1.2.3


From fab0de02fb0da83b90cec7fce4294747d86d5c6f Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:32 +0200
Subject: IPVS: Add CONFIG_IP_VS_IPV6 option for IPv6 support

Add boolean config option CONFIG_IP_VS_IPV6 for enabling experimental IPv6
support in IPVS. Only visible if IPv6 support is set to 'y' or both IPv6
and IPVS are modules.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/Kconfig | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 2e48a7e2722..794cecb249a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -24,6 +24,14 @@ menuconfig IP_VS
 
 if IP_VS
 
+config	IP_VS_IPV6
+	bool "IPv6 support for IPVS (DANGEROUS)"
+	depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
+	---help---
+	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+
+	  Say N if unsure.
+
 config	IP_VS_DEBUG
 	bool "IP virtual server debugging"
 	---help---
-- 
cgit v1.2.3


From e7ade46a53055c19a01c8becbe7807f9075d6fee Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:33 +0200
Subject: IPVS: Change IPVS data structures to support IPv6 addresses

Introduce new 'af' fields into IPVS data structures for specifying an
entry's address family. Convert IP addresses to be of type union
nf_inet_addr.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_conn.c      | 60 ++++++++++++++++++++---------------------
 net/ipv4/ipvs/ip_vs_core.c      | 28 +++++++++----------
 net/ipv4/ipvs/ip_vs_ctl.c       | 37 ++++++++++++-------------
 net/ipv4/ipvs/ip_vs_dh.c        |  2 +-
 net/ipv4/ipvs/ip_vs_ftp.c       | 18 ++++++-------
 net/ipv4/ipvs/ip_vs_lblc.c      |  4 +--
 net/ipv4/ipvs/ip_vs_lblcr.c     |  8 +++---
 net/ipv4/ipvs/ip_vs_lc.c        |  2 +-
 net/ipv4/ipvs/ip_vs_nq.c        |  2 +-
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 16 +++++------
 net/ipv4/ipvs/ip_vs_proto_udp.c | 12 ++++-----
 net/ipv4/ipvs/ip_vs_rr.c        |  2 +-
 net/ipv4/ipvs/ip_vs_sed.c       |  2 +-
 net/ipv4/ipvs/ip_vs_sh.c        |  2 +-
 net/ipv4/ipvs/ip_vs_sync.c      |  6 ++---
 net/ipv4/ipvs/ip_vs_wlc.c       |  2 +-
 net/ipv4/ipvs/ip_vs_wrr.c       |  2 +-
 net/ipv4/ipvs/ip_vs_xmit.c      | 12 ++++-----
 18 files changed, 109 insertions(+), 108 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 44a6872dc24..639d4bc7fc1 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -131,7 +131,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport);
 
 	ct_write_lock(hash);
 
@@ -162,7 +162,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport);
 
 	ct_write_lock(hash);
 
@@ -197,10 +197,10 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (s_addr==cp->caddr && s_port==cp->cport &&
-		    d_port==cp->vport && d_addr==cp->vaddr &&
+		if (s_addr == cp->caddr.ip && s_port == cp->cport &&
+		    d_port == cp->vport && d_addr == cp->vaddr.ip &&
 		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-		    protocol==cp->protocol) {
+		    protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ct_read_unlock(hash);
@@ -243,10 +243,10 @@ struct ip_vs_conn *ip_vs_ct_in_get
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (s_addr==cp->caddr && s_port==cp->cport &&
-		    d_port==cp->vport && d_addr==cp->vaddr &&
+		if (s_addr == cp->caddr.ip && s_port == cp->cport &&
+		    d_port == cp->vport && d_addr == cp->vaddr.ip &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    protocol==cp->protocol) {
+		    protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			goto out;
@@ -286,8 +286,8 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (d_addr == cp->caddr && d_port == cp->cport &&
-		    s_port == cp->dport && s_addr == cp->daddr &&
+		if (d_addr == cp->caddr.ip && d_port == cp->cport &&
+		    s_port == cp->dport && s_addr == cp->daddr.ip &&
 		    protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
@@ -406,9 +406,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
 		  "dest->refcnt:%d\n",
 		  ip_vs_proto_name(cp->protocol),
-		  NIPQUAD(cp->caddr), ntohs(cp->cport),
-		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
-		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
 		  ip_vs_fwd_tag(cp), cp->state,
 		  cp->flags, atomic_read(&cp->refcnt),
 		  atomic_read(&dest->refcnt));
@@ -444,8 +444,8 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	struct ip_vs_dest *dest;
 
 	if ((cp) && (!cp->dest)) {
-		dest = ip_vs_find_dest(cp->daddr, cp->dport,
-				       cp->vaddr, cp->vport, cp->protocol);
+		dest = ip_vs_find_dest(cp->daddr.ip, cp->dport,
+				       cp->vaddr.ip, cp->vport, cp->protocol);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
 	} else
@@ -468,9 +468,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
 		  "dest->refcnt:%d\n",
 		  ip_vs_proto_name(cp->protocol),
-		  NIPQUAD(cp->caddr), ntohs(cp->cport),
-		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
-		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
 		  ip_vs_fwd_tag(cp), cp->state,
 		  cp->flags, atomic_read(&cp->refcnt),
 		  atomic_read(&dest->refcnt));
@@ -530,9 +530,9 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
 			  "-> d:%u.%u.%u.%u:%d\n",
 			  ip_vs_proto_name(ct->protocol),
-			  NIPQUAD(ct->caddr), ntohs(ct->cport),
-			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
-			  NIPQUAD(ct->daddr), ntohs(ct->dport));
+			  NIPQUAD(ct->caddr.ip), ntohs(ct->cport),
+			  NIPQUAD(ct->vaddr.ip), ntohs(ct->vport),
+			  NIPQUAD(ct->daddr.ip), ntohs(ct->dport));
 
 		/*
 		 * Invalidate the connection template
@@ -641,11 +641,11 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
 	cp->protocol	   = proto;
-	cp->caddr	   = caddr;
+	cp->caddr.ip	   = caddr;
 	cp->cport	   = cport;
-	cp->vaddr	   = vaddr;
+	cp->vaddr.ip	   = vaddr;
 	cp->vport	   = vport;
-	cp->daddr          = daddr;
+	cp->daddr.ip	   = daddr;
 	cp->dport          = dport;
 	cp->flags	   = flags;
 	spin_lock_init(&cp->lock);
@@ -763,9 +763,9 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 		seq_printf(seq,
 			"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
 				ip_vs_proto_name(cp->protocol),
-				ntohl(cp->caddr), ntohs(cp->cport),
-				ntohl(cp->vaddr), ntohs(cp->vport),
-				ntohl(cp->daddr), ntohs(cp->dport),
+				ntohl(cp->caddr.ip), ntohs(cp->cport),
+				ntohl(cp->vaddr.ip), ntohs(cp->vport),
+				ntohl(cp->daddr.ip), ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
 				(cp->timer.expires-jiffies)/HZ);
 	}
@@ -812,9 +812,9 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
 		seq_printf(seq,
 			"%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n",
 				ip_vs_proto_name(cp->protocol),
-				ntohl(cp->caddr), ntohs(cp->cport),
-				ntohl(cp->vaddr), ntohs(cp->vport),
-				ntohl(cp->daddr), ntohs(cp->dport),
+				ntohl(cp->caddr.ip), ntohs(cp->cport),
+				ntohl(cp->vaddr.ip), ntohs(cp->vport),
+				ntohl(cp->daddr.ip), ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
 				ip_vs_origin_name(cp->flags),
 				(cp->timer.expires-jiffies)/HZ);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 9fbf0a6d739..4a54f33b60d 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -232,14 +232,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 						    snet, 0,
 						    iph->daddr,
 						    ports[1],
-						    dest->addr, dest->port,
+						    dest->addr.ip, dest->port,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			else
 				ct = ip_vs_conn_new(iph->protocol,
 						    snet, 0,
 						    iph->daddr, 0,
-						    dest->addr, 0,
+						    dest->addr.ip, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			if (ct == NULL)
@@ -286,14 +286,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				ct = ip_vs_conn_new(IPPROTO_IP,
 						    snet, 0,
 						    htonl(svc->fwmark), 0,
-						    dest->addr, 0,
+						    dest->addr.ip, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			else
 				ct = ip_vs_conn_new(iph->protocol,
 						    snet, 0,
 						    iph->daddr, 0,
-						    dest->addr, 0,
+						    dest->addr.ip, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			if (ct == NULL)
@@ -313,7 +313,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	cp = ip_vs_conn_new(iph->protocol,
 			    iph->saddr, ports[0],
 			    iph->daddr, ports[1],
-			    dest->addr, dport,
+			    dest->addr.ip, dport,
 			    0,
 			    dest);
 	if (cp == NULL) {
@@ -380,7 +380,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	cp = ip_vs_conn_new(iph->protocol,
 			    iph->saddr, pptr[0],
 			    iph->daddr, pptr[1],
-			    dest->addr, dest->port?dest->port:pptr[1],
+			    dest->addr.ip, dest->port ? dest->port : pptr[1],
 			    0,
 			    dest);
 	if (cp == NULL)
@@ -389,9 +389,9 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
 		  "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
 		  ip_vs_fwd_tag(cp),
-		  NIPQUAD(cp->caddr), ntohs(cp->cport),
-		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
-		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
 		  cp->flags, atomic_read(&cp->refcnt));
 
 	ip_vs_conn_stats(cp, svc);
@@ -526,14 +526,14 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
 
 	if (inout) {
-		iph->saddr = cp->vaddr;
+		iph->saddr = cp->vaddr.ip;
 		ip_send_check(iph);
-		ciph->daddr = cp->vaddr;
+		ciph->daddr = cp->vaddr.ip;
 		ip_send_check(ciph);
 	} else {
-		iph->daddr = cp->daddr;
+		iph->daddr = cp->daddr.ip;
 		ip_send_check(iph);
-		ciph->saddr = cp->daddr;
+		ciph->saddr = cp->daddr.ip;
 		ip_send_check(ciph);
 	}
 
@@ -762,7 +762,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	/* mangle the packet */
 	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
 		goto drop;
-	ip_hdr(skb)->saddr = cp->vaddr;
+	ip_hdr(skb)->saddr = cp->vaddr.ip;
 	ip_send_check(ip_hdr(skb));
 
 	/* For policy routing, packets originating from this
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index ede101eeec1..3f2277b847d 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -317,7 +317,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
 		/*
 		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
 		 */
-		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
+		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr.ip,
+					 svc->port);
 		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
 	} else {
 		/*
@@ -373,7 +374,7 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
 	hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
 
 	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
-		if ((svc->addr == vaddr)
+		if ((svc->addr.ip == vaddr)
 		    && (svc->port == vport)
 		    && (svc->protocol == protocol)) {
 			/* HIT */
@@ -503,7 +504,7 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
 	 *	Hash by proto,addr,port,
 	 *	which are the parameters of the real service.
 	 */
-	hash = ip_vs_rs_hashkey(dest->addr, dest->port);
+	hash = ip_vs_rs_hashkey(dest->addr.ip, dest->port);
 	list_add(&dest->d_list, &ip_vs_rtable[hash]);
 
 	return 1;
@@ -543,7 +544,7 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
 
 	read_lock(&__ip_vs_rs_lock);
 	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
-		if ((dest->addr == daddr)
+		if ((dest->addr.ip == daddr)
 		    && (dest->port == dport)
 		    && ((dest->protocol == protocol) ||
 			dest->vfwmark)) {
@@ -569,7 +570,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
 	 * Find the destination for the given service
 	 */
 	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if ((dest->addr == daddr) && (dest->port == dport)) {
+		if ((dest->addr.ip == daddr) && (dest->port == dport)) {
 			/* HIT */
 			return dest;
 		}
@@ -626,14 +627,14 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
 		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
 			  "dest->refcnt=%d\n",
 			  dest->vfwmark,
-			  NIPQUAD(dest->addr), ntohs(dest->port),
+			  NIPQUAD(dest->addr.ip), ntohs(dest->port),
 			  atomic_read(&dest->refcnt));
-		if (dest->addr == daddr &&
+		if (dest->addr.ip == daddr &&
 		    dest->port == dport &&
 		    dest->vfwmark == svc->fwmark &&
 		    dest->protocol == svc->protocol &&
 		    (svc->fwmark ||
-		     (dest->vaddr == svc->addr &&
+		     (dest->vaddr.ip == svc->addr.ip &&
 		      dest->vport == svc->port))) {
 			/* HIT */
 			return dest;
@@ -646,7 +647,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
 			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
 				  "from trash\n",
 				  dest->vfwmark,
-				  NIPQUAD(dest->addr), ntohs(dest->port));
+				  NIPQUAD(dest->addr.ip), ntohs(dest->port));
 			list_del(&dest->n_list);
 			ip_vs_dst_reset(dest);
 			__ip_vs_unbind_svc(dest);
@@ -779,10 +780,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
 	}
 
 	dest->protocol = svc->protocol;
-	dest->vaddr = svc->addr;
+	dest->vaddr.ip = svc->addr.ip;
 	dest->vport = svc->port;
 	dest->vfwmark = svc->fwmark;
-	dest->addr = udest->addr;
+	dest->addr.ip = udest->addr;
 	dest->port = udest->port;
 
 	atomic_set(&dest->activeconns, 0);
@@ -847,7 +848,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 			  NIPQUAD(daddr), ntohs(dport),
 			  atomic_read(&dest->refcnt),
 			  dest->vfwmark,
-			  NIPQUAD(dest->vaddr),
+			  NIPQUAD(dest->vaddr.ip),
 			  ntohs(dest->vport));
 		__ip_vs_update_dest(svc, dest, udest);
 
@@ -993,7 +994,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 	} else {
 		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
 			  "dest->refcnt=%d\n",
-			  NIPQUAD(dest->addr), ntohs(dest->port),
+			  NIPQUAD(dest->addr.ip), ntohs(dest->port),
 			  atomic_read(&dest->refcnt));
 		list_add(&dest->n_list, &ip_vs_dest_trash);
 		atomic_inc(&dest->refcnt);
@@ -1101,7 +1102,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
 	atomic_set(&svc->refcnt, 0);
 
 	svc->protocol = u->protocol;
-	svc->addr = u->addr;
+	svc->addr.ip = u->addr;
 	svc->port = u->port;
 	svc->fwmark = u->fwmark;
 	svc->flags = u->flags;
@@ -1751,7 +1752,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		if (iter->table == ip_vs_svc_table)
 			seq_printf(seq, "%s  %08X:%04X %s ",
 				   ip_vs_proto_name(svc->protocol),
-				   ntohl(svc->addr),
+				   ntohl(svc->addr.ip),
 				   ntohs(svc->port),
 				   svc->scheduler->name);
 		else
@@ -1768,7 +1769,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		list_for_each_entry(dest, &svc->destinations, n_list) {
 			seq_printf(seq,
 				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
-				   ntohl(dest->addr), ntohs(dest->port),
+				   ntohl(dest->addr.ip), ntohs(dest->port),
 				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
 				   atomic_read(&dest->weight),
 				   atomic_read(&dest->activeconns),
@@ -2040,7 +2041,7 @@ static void
 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 {
 	dst->protocol = src->protocol;
-	dst->addr = src->addr;
+	dst->addr = src->addr.ip;
 	dst->port = src->port;
 	dst->fwmark = src->fwmark;
 	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
@@ -2114,7 +2115,7 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 			if (count >= get->num_dests)
 				break;
 
-			entry.addr = dest->addr;
+			entry.addr = dest->addr.ip;
 			entry.port = dest->port;
 			entry.conn_flags = atomic_read(&dest->conn_flags);
 			entry.weight = atomic_read(&dest->weight);
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index fa66824d264..9f9d795dbd7 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -218,7 +218,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
 		  NIPQUAD(iph->daddr),
-		  NIPQUAD(dest->addr),
+		  NIPQUAD(dest->addr.ip),
 		  ntohs(dest->port));
 
 	return dest;
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
index c1c758e4f73..bfe5d7050a5 100644
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -172,17 +172,17 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
 		IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
 			  "%u.%u.%u.%u:%d detected\n",
-			  NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
+			  NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr.ip), 0);
 
 		/*
 		 * Now update or create an connection entry for it
 		 */
 		n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
-					  cp->caddr, 0);
+					  cp->caddr.ip, 0);
 		if (!n_cp) {
 			n_cp = ip_vs_conn_new(IPPROTO_TCP,
-					      cp->caddr, 0,
-					      cp->vaddr, port,
+					      cp->caddr.ip, 0,
+					      cp->vaddr.ip, port,
 					      from, port,
 					      IP_VS_CONN_F_NO_CPORT,
 					      cp->dest);
@@ -196,7 +196,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		/*
 		 * Replace the old passive address with the new one
 		 */
-		from = n_cp->vaddr;
+		from = n_cp->vaddr.ip;
 		port = n_cp->vport;
 		sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
 			(ntohs(port)>>8)&255, ntohs(port)&255);
@@ -306,16 +306,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	 */
 	IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
 		  ip_vs_proto_name(iph->protocol),
-		  NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
+		  NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
 
 	n_cp = ip_vs_conn_in_get(iph->protocol,
 				 to, port,
-				 cp->vaddr, htons(ntohs(cp->vport)-1));
+				 cp->vaddr.ip, htons(ntohs(cp->vport)-1));
 	if (!n_cp) {
 		n_cp = ip_vs_conn_new(IPPROTO_TCP,
 				      to, port,
-				      cp->vaddr, htons(ntohs(cp->vport)-1),
-				      cp->daddr, htons(ntohs(cp->dport)-1),
+				      cp->vaddr.ip, htons(ntohs(cp->vport)-1),
+				      cp->daddr.ip, htons(ntohs(cp->dport)-1),
 				      0,
 				      cp->dest);
 		if (!n_cp)
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index d2a43aa3fe4..69309edc0c4 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -422,7 +422,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 
 	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->refcnt),
 		  atomic_read(&least->weight), loh);
@@ -506,7 +506,7 @@ out:
 	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
 		  NIPQUAD(iph->daddr),
-		  NIPQUAD(dest->addr),
+		  NIPQUAD(dest->addr.ip),
 		  ntohs(dest->port));
 
 	return dest;
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 375a1ffb6b6..51c746e2083 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -204,7 +204,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 
 	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->refcnt),
 		  atomic_read(&least->weight), loh);
@@ -250,7 +250,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 
 	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(most->addr), ntohs(most->port),
+		  NIPQUAD(most->addr.ip), ntohs(most->port),
 		  atomic_read(&most->activeconns),
 		  atomic_read(&most->refcnt),
 		  atomic_read(&most->weight), moh);
@@ -598,7 +598,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 
 	IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->refcnt),
 		  atomic_read(&least->weight), loh);
@@ -706,7 +706,7 @@ out:
 	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
 		  NIPQUAD(iph->daddr),
-		  NIPQUAD(dest->addr),
+		  NIPQUAD(dest->addr.ip),
 		  ntohs(dest->port));
 
 	return dest;
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
index 2c3de1b6351..551d293347f 100644
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -68,7 +68,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	if (least)
 	IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->inactconns));
 
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
index 5330d5a2de1..aa0e32ad348 100644
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -101,7 +101,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
   out:
 	IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->refcnt),
 		  atomic_read(&least->weight), loh);
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index d0ea467986a..15860e1441b 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -147,7 +147,7 @@ tcp_snat_handler(struct sk_buff *skb,
 	/* Adjust TCP checksums */
 	if (!cp->app) {
 		/* Only port and addr are changed, do fast csum update */
-		tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
+		tcp_fast_csum_update(tcph, cp->daddr.ip, cp->vaddr.ip,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -155,7 +155,7 @@ tcp_snat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		tcph->check = 0;
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-		tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
+		tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip,
 						skb->len - tcphoff,
 						cp->protocol, skb->csum);
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
@@ -198,7 +198,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	 */
 	if (!cp->app) {
 		/* Only port and addr are changed, do fast csum update */
-		tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
+		tcp_fast_csum_update(tcph, cp->vaddr.ip, cp->daddr.ip,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -206,7 +206,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		tcph->check = 0;
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-		tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
+		tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip,
 						skb->len - tcphoff,
 						cp->protocol, skb->csum);
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -427,8 +427,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 			  th->fin? 'F' : '.',
 			  th->ack? 'A' : '.',
 			  th->rst? 'R' : '.',
-			  NIPQUAD(cp->daddr), ntohs(cp->dport),
-			  NIPQUAD(cp->caddr), ntohs(cp->cport),
+			  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
+			  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
 			  tcp_state_name(cp->state),
 			  tcp_state_name(new_state),
 			  atomic_read(&cp->refcnt));
@@ -549,8 +549,8 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
 				  "%u.%u.%u.%u:%u to app %s on port %u\n",
 				  __func__,
-				  NIPQUAD(cp->caddr), ntohs(cp->cport),
-				  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+				  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
+				  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
 				  inc->name, ntohs(inc->port));
 			cp->app = inc;
 			if (inc->init_conn)
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index c6be5d56823..8dfad5db829 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -158,7 +158,7 @@ udp_snat_handler(struct sk_buff *skb,
 	 */
 	if (!cp->app && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
-		udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
+		udp_fast_csum_update(udph, cp->daddr.ip, cp->vaddr.ip,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -166,7 +166,7 @@ udp_snat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		udph->check = 0;
 		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-		udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
+		udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip,
 						skb->len - udphoff,
 						cp->protocol, skb->csum);
 		if (udph->check == 0)
@@ -211,7 +211,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	 */
 	if (!cp->app && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
-		udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
+		udp_fast_csum_update(udph, cp->vaddr.ip, cp->daddr.ip,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -219,7 +219,7 @@ udp_dnat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		udph->check = 0;
 		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-		udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
+		udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip,
 						skb->len - udphoff,
 						cp->protocol, skb->csum);
 		if (udph->check == 0)
@@ -343,8 +343,8 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
 			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
 				  "%u.%u.%u.%u:%u to app %s on port %u\n",
 				  __func__,
-				  NIPQUAD(cp->caddr), ntohs(cp->cport),
-				  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+				  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
+				  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
 				  inc->name, ntohs(inc->port));
 			cp->app = inc;
 			if (inc->init_conn)
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
index f7492911753..27f0b624283 100644
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -76,7 +76,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	write_unlock(&svc->sched_lock);
 	IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
 		  "activeconns %d refcnt %d weight %d\n",
-		  NIPQUAD(dest->addr), ntohs(dest->port),
+		  NIPQUAD(dest->addr.ip), ntohs(dest->port),
 		  atomic_read(&dest->activeconns),
 		  atomic_read(&dest->refcnt), atomic_read(&dest->weight));
 
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
index 53f73bea66c..38b574b2fdf 100644
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -103,7 +103,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->refcnt),
 		  atomic_read(&least->weight), loh);
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 7b979e22805..c9e54e2ec29 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -215,7 +215,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
 		  NIPQUAD(iph->saddr),
-		  NIPQUAD(dest->addr),
+		  NIPQUAD(dest->addr.ip),
 		  ntohs(dest->port));
 
 	return dest;
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index a652da2c320..2cf47b2e166 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -256,9 +256,9 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 	s->cport = cp->cport;
 	s->vport = cp->vport;
 	s->dport = cp->dport;
-	s->caddr = cp->caddr;
-	s->vaddr = cp->vaddr;
-	s->daddr = cp->daddr;
+	s->caddr = cp->caddr.ip;
+	s->vaddr = cp->vaddr.ip;
+	s->daddr = cp->daddr.ip;
 	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
 	s->state = htons(cp->state);
 	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
index df7ad8d7476..09fd993040f 100644
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -91,7 +91,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr), ntohs(least->port),
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
 		  atomic_read(&least->activeconns),
 		  atomic_read(&least->refcnt),
 		  atomic_read(&least->weight), loh);
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
index 0d86a79b87b..19c49b234f3 100644
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -197,7 +197,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
 		  "activeconns %d refcnt %d weight %d\n",
-		  NIPQUAD(dest->addr), ntohs(dest->port),
+		  NIPQUAD(dest->addr.ip), ntohs(dest->port),
 		  atomic_read(&dest->activeconns),
 		  atomic_read(&dest->refcnt),
 		  atomic_read(&dest->weight));
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index 9892d4aca42..88199c9f2d3 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -71,7 +71,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 				.oif = 0,
 				.nl_u = {
 					.ip4_u = {
-						.daddr = dest->addr,
+						.daddr = dest->addr.ip,
 						.saddr = 0,
 						.tos = rtos, } },
 			};
@@ -80,12 +80,12 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, "
 					     "dest: %u.%u.%u.%u\n",
-					     NIPQUAD(dest->addr));
+					     NIPQUAD(dest->addr.ip));
 				return NULL;
 			}
 			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
 			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
-				  NIPQUAD(dest->addr),
+				  NIPQUAD(dest->addr.ip),
 				  atomic_read(&rt->u.dst.__refcnt), rtos);
 		}
 		spin_unlock(&dest->dst_lock);
@@ -94,14 +94,14 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 			.oif = 0,
 			.nl_u = {
 				.ip4_u = {
-					.daddr = cp->daddr,
+					.daddr = cp->daddr.ip,
 					.saddr = 0,
 					.tos = rtos, } },
 		};
 
 		if (ip_route_output_key(&init_net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: "
-				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
+				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
 			return NULL;
 		}
 	}
@@ -264,7 +264,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
 		goto tx_error;
-	ip_hdr(skb)->daddr = cp->daddr;
+	ip_hdr(skb)->daddr = cp->daddr.ip;
 	ip_send_check(ip_hdr(skb));
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-- 
cgit v1.2.3


From c860c6b1479992440e4962e9c95d258bfdce4fca Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:36 +0200
Subject: IPVS: Add internal versions of sockopt interface structs

Add extended internal versions of struct ip_vs_service_user and struct
ip_vs_dest_user (the originals can't be modified as they are part
of the old sockopt interface). Adjust ip_vs_ctl.c to work with the new
data structures and add some minor AF-awareness.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 138 ++++++++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 3f2277b847d..a0c8b7bb553 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -708,7 +708,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
  */
 static void
 __ip_vs_update_dest(struct ip_vs_service *svc,
-		    struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
+		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
 {
 	int conn_flags;
 
@@ -717,7 +717,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
 
 	/* check if local node and update the flags */
-	if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
+	if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
 		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
 			| IP_VS_CONN_F_LOCALNODE;
 	}
@@ -761,7 +761,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
  *	Create a destination for the given service
  */
 static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	       struct ip_vs_dest **dest_p)
 {
 	struct ip_vs_dest *dest;
@@ -769,7 +769,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
 
 	EnterFunction(2);
 
-	atype = inet_addr_type(&init_net, udest->addr);
+	atype = inet_addr_type(&init_net, udest->addr.ip);
 	if (atype != RTN_LOCAL && atype != RTN_UNICAST)
 		return -EINVAL;
 
@@ -779,11 +779,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
 		return -ENOMEM;
 	}
 
+	dest->af = svc->af;
 	dest->protocol = svc->protocol;
-	dest->vaddr.ip = svc->addr.ip;
+	dest->vaddr = svc->addr;
 	dest->vport = svc->port;
 	dest->vfwmark = svc->fwmark;
-	dest->addr.ip = udest->addr;
+	ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
 	dest->port = udest->port;
 
 	atomic_set(&dest->activeconns, 0);
@@ -808,10 +809,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
  *	Add a destination into an existing service
  */
 static int
-ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 {
 	struct ip_vs_dest *dest;
-	__be32 daddr = udest->addr;
+	union nf_inet_addr daddr;
 	__be16 dport = udest->port;
 	int ret;
 
@@ -828,10 +829,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 		return -ERANGE;
 	}
 
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
 	/*
 	 * Check if the dest already exists in the list
 	 */
-	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	dest = ip_vs_lookup_dest(svc, daddr.ip, dport);
 	if (dest != NULL) {
 		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
 		return -EEXIST;
@@ -841,7 +844,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	 * Check if the dest already exists in the trash and
 	 * is from the same service
 	 */
-	dest = ip_vs_trash_get_dest(svc, daddr, dport);
+	dest = ip_vs_trash_get_dest(svc, daddr.ip, dport);
 	if (dest != NULL) {
 		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
 			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
@@ -916,10 +919,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
  *	Edit a destination in the given service
  */
 static int
-ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 {
 	struct ip_vs_dest *dest;
-	__be32 daddr = udest->addr;
+	union nf_inet_addr daddr;
 	__be16 dport = udest->port;
 
 	EnterFunction(2);
@@ -935,10 +938,12 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 		return -ERANGE;
 	}
 
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
 	/*
 	 *  Lookup the destination list
 	 */
-	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	dest = ip_vs_lookup_dest(svc, daddr.ip, dport);
 	if (dest == NULL) {
 		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
 		return -ENOENT;
@@ -1029,15 +1034,15 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
  *	Delete a destination server in the given service
  */
 static int
-ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 {
 	struct ip_vs_dest *dest;
-	__be32 daddr = udest->addr;
 	__be16 dport = udest->port;
 
 	EnterFunction(2);
 
-	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	dest = ip_vs_lookup_dest(svc, udest->addr.ip, dport);
+
 	if (dest == NULL) {
 		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
 		return -ENOENT;
@@ -1072,7 +1077,8 @@ ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
  *	Add a service into the service hash table
  */
 static int
-ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
+ip_vs_add_service(struct ip_vs_service_user_kern *u,
+		  struct ip_vs_service **svc_p)
 {
 	int ret = 0;
 	struct ip_vs_scheduler *sched = NULL;
@@ -1101,8 +1107,9 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
 	atomic_set(&svc->usecnt, 1);
 	atomic_set(&svc->refcnt, 0);
 
+	svc->af = u->af;
 	svc->protocol = u->protocol;
-	svc->addr.ip = u->addr;
+	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
 	svc->port = u->port;
 	svc->fwmark = u->fwmark;
 	svc->flags = u->flags;
@@ -1161,7 +1168,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
  *	Edit a service and bind it with a new scheduler
  */
 static int
-ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
 	struct ip_vs_scheduler *sched, *old_sched;
 	int ret = 0;
@@ -1905,14 +1912,44 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
 	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
 };
 
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+				  struct ip_vs_service_user *usvc_compat)
+{
+	usvc->af		= AF_INET;
+	usvc->protocol		= usvc_compat->protocol;
+	usvc->addr.ip		= usvc_compat->addr;
+	usvc->port		= usvc_compat->port;
+	usvc->fwmark		= usvc_compat->fwmark;
+
+	/* Deep copy of sched_name is not needed here */
+	usvc->sched_name	= usvc_compat->sched_name;
+
+	usvc->flags		= usvc_compat->flags;
+	usvc->timeout		= usvc_compat->timeout;
+	usvc->netmask		= usvc_compat->netmask;
+}
+
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+				   struct ip_vs_dest_user *udest_compat)
+{
+	udest->addr.ip		= udest_compat->addr;
+	udest->port		= udest_compat->port;
+	udest->conn_flags	= udest_compat->conn_flags;
+	udest->weight		= udest_compat->weight;
+	udest->u_threshold	= udest_compat->u_threshold;
+	udest->l_threshold	= udest_compat->l_threshold;
+}
+
 static int
 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
 	int ret;
 	unsigned char arg[MAX_ARG_LEN];
-	struct ip_vs_service_user *usvc;
+	struct ip_vs_service_user *usvc_compat;
+	struct ip_vs_service_user_kern usvc;
 	struct ip_vs_service *svc;
-	struct ip_vs_dest_user *udest;
+	struct ip_vs_dest_user *udest_compat;
+	struct ip_vs_dest_user_kern udest;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -1952,35 +1989,40 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		goto out_unlock;
 	}
 
-	usvc = (struct ip_vs_service_user *)arg;
-	udest = (struct ip_vs_dest_user *)(usvc + 1);
+	usvc_compat = (struct ip_vs_service_user *)arg;
+	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+	/* We only use the new structs internally, so copy userspace compat
+	 * structs to extended internal versions */
+	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+	ip_vs_copy_udest_compat(&udest, udest_compat);
 
 	if (cmd == IP_VS_SO_SET_ZERO) {
 		/* if no service address is set, zero counters in all */
-		if (!usvc->fwmark && !usvc->addr && !usvc->port) {
+		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
 			ret = ip_vs_zero_all();
 			goto out_unlock;
 		}
 	}
 
 	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
-	if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
+	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
 		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
-			  usvc->protocol, NIPQUAD(usvc->addr),
-			  ntohs(usvc->port), usvc->sched_name);
+			  usvc.protocol, NIPQUAD(usvc.addr.ip),
+			  ntohs(usvc.port), usvc.sched_name);
 		ret = -EFAULT;
 		goto out_unlock;
 	}
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc->fwmark == 0)
-		svc = __ip_vs_service_get(usvc->protocol,
-					  usvc->addr, usvc->port);
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_get(usvc.protocol,
+					  usvc.addr.ip, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+		svc = __ip_vs_svc_fwm_get(usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
-	    && (svc == NULL || svc->protocol != usvc->protocol)) {
+	    && (svc == NULL || svc->protocol != usvc.protocol)) {
 		ret = -ESRCH;
 		goto out_unlock;
 	}
@@ -1990,10 +2032,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		if (svc != NULL)
 			ret = -EEXIST;
 		else
-			ret = ip_vs_add_service(usvc, &svc);
+			ret = ip_vs_add_service(&usvc, &svc);
 		break;
 	case IP_VS_SO_SET_EDIT:
-		ret = ip_vs_edit_service(svc, usvc);
+		ret = ip_vs_edit_service(svc, &usvc);
 		break;
 	case IP_VS_SO_SET_DEL:
 		ret = ip_vs_del_service(svc);
@@ -2004,13 +2046,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		ret = ip_vs_zero_service(svc);
 		break;
 	case IP_VS_SO_SET_ADDDEST:
-		ret = ip_vs_add_dest(svc, udest);
+		ret = ip_vs_add_dest(svc, &udest);
 		break;
 	case IP_VS_SO_SET_EDITDEST:
-		ret = ip_vs_edit_dest(svc, udest);
+		ret = ip_vs_edit_dest(svc, &udest);
 		break;
 	case IP_VS_SO_SET_DELDEST:
-		ret = ip_vs_del_dest(svc, udest);
+		ret = ip_vs_del_dest(svc, &udest);
 		break;
 	default:
 		ret = -EINVAL;
@@ -2517,7 +2559,7 @@ nla_put_failure:
 	return skb->len;
 }
 
-static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
+static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 				    struct nlattr *nla, int full_entry)
 {
 	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
@@ -2537,6 +2579,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
 	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
 		return -EINVAL;
 
+	usvc->af = nla_get_u16(nla_af);
 	/* For now, only support IPv4 */
 	if (nla_get_u16(nla_af) != AF_INET)
 		return -EAFNOSUPPORT;
@@ -2572,7 +2615,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
 		if (usvc->fwmark)
 			svc = __ip_vs_svc_fwm_get(usvc->fwmark);
 		else
-			svc = __ip_vs_service_get(usvc->protocol, usvc->addr,
+			svc = __ip_vs_service_get(usvc->protocol, usvc->addr.ip,
 						  usvc->port);
 		if (svc) {
 			usvc->flags = svc->flags;
@@ -2583,9 +2626,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
 		/* set new flags from userland */
 		usvc->flags = (usvc->flags & ~flags.mask) |
 			      (flags.flags & flags.mask);
-
-		strlcpy(usvc->sched_name, nla_data(nla_sched),
-			sizeof(usvc->sched_name));
+		usvc->sched_name = nla_data(nla_sched);
 		usvc->timeout = nla_get_u32(nla_timeout);
 		usvc->netmask = nla_get_u32(nla_netmask);
 	}
@@ -2595,7 +2636,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
 
 static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 {
-	struct ip_vs_service_user usvc;
+	struct ip_vs_service_user_kern usvc;
 	int ret;
 
 	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
@@ -2605,7 +2646,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 	if (usvc.fwmark)
 		return __ip_vs_svc_fwm_get(usvc.fwmark);
 	else
-		return __ip_vs_service_get(usvc.protocol, usvc.addr,
+		return __ip_vs_service_get(usvc.protocol, usvc.addr.ip,
 					   usvc.port);
 }
 
@@ -2705,7 +2746,7 @@ out_err:
 	return skb->len;
 }
 
-static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest,
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 				 struct nlattr *nla, int full_entry)
 {
 	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
@@ -2861,8 +2902,8 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	struct ip_vs_service *svc = NULL;
-	struct ip_vs_service_user usvc;
-	struct ip_vs_dest_user udest;
+	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_dest_user_kern udest;
 	int ret = 0, cmd;
 	int need_full_svc = 0, need_full_dest = 0;
 
@@ -2914,7 +2955,8 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port);
+		svc = __ip_vs_service_get(usvc.protocol, usvc.addr.ip,
+					  usvc.port);
 	else
 		svc = __ip_vs_svc_fwm_get(usvc.fwmark);
 
-- 
cgit v1.2.3


From b18610de9ec2728159f723a9b864ca78a5774193 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:37 +0200
Subject: IPVS: Convert __ip_vs_svc_get() and __ip_vs_fwm_get()

Add support for getting services based on their address family to
__ip_vs_service_get(), __ip_vs_fwm_get() and the helper hash function
ip_vs_svc_hashkey(). Adjust the callers.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 79 ++++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index a0c8b7bb553..a2d69b2ce6a 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -282,11 +282,19 @@ static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
  *	Returns hash value for virtual service
  */
 static __inline__ unsigned
-ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
+ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
+		  __be16 port)
 {
 	register unsigned porth = ntohs(port);
+	__be32 addr_fold = addr->ip;
 
-	return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+
+	return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
 		& IP_VS_SVC_TAB_MASK;
 }
 
@@ -317,7 +325,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
 		/*
 		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
 		 */
-		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr.ip,
+		hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
 					 svc->port);
 		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
 	} else {
@@ -364,17 +372,19 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
 /*
  *	Get service by {proto,addr,port} in the service table.
  */
-static __inline__ struct ip_vs_service *
-__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
+static inline struct ip_vs_service *
+__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+		    __be16 vport)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
 
 	/* Check for "full" addressed entries */
-	hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
+	hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
 
 	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
-		if ((svc->addr.ip == vaddr)
+		if ((svc->af == af)
+		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
 		    && (svc->port == vport)
 		    && (svc->protocol == protocol)) {
 			/* HIT */
@@ -390,7 +400,8 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
 /*
  *	Get service by {fwmark} in the service table.
  */
-static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_get(int af, __u32 fwmark)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
@@ -399,7 +410,7 @@ static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
 	hash = ip_vs_svc_fwm_hashkey(fwmark);
 
 	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
-		if (svc->fwmark == fwmark) {
+		if (svc->fwmark == fwmark && svc->af == af) {
 			/* HIT */
 			atomic_inc(&svc->usecnt);
 			return svc;
@@ -413,20 +424,20 @@ struct ip_vs_service *
 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
 {
 	struct ip_vs_service *svc;
-
+	union nf_inet_addr _vaddr = { .ip = vaddr };
 	read_lock(&__ip_vs_svc_lock);
 
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_get(AF_INET, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_get(protocol, vaddr, vport);
+	svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -436,7 +447,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
+		svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -444,7 +455,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_get(protocol, vaddr, 0);
+		svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, 0);
 	}
 
   out:
@@ -2016,10 +2027,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.protocol,
-					  usvc.addr.ip, usvc.port);
+		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
+					  &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc.fwmark);
+		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2141,13 +2152,15 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 			 struct ip_vs_get_dests __user *uptr)
 {
 	struct ip_vs_service *svc;
+	union nf_inet_addr addr = { .ip = get->addr };
 	int ret = 0;
 
 	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_get(get->fwmark);
+		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
 	else
-		svc = __ip_vs_service_get(get->protocol,
-					  get->addr, get->port);
+		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
+					  get->port);
+
 	if (svc) {
 		int count = 0;
 		struct ip_vs_dest *dest;
@@ -2282,13 +2295,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	{
 		struct ip_vs_service_entry *entry;
 		struct ip_vs_service *svc;
+		union nf_inet_addr addr;
 
 		entry = (struct ip_vs_service_entry *)arg;
+		addr.ip = entry->addr;
 		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_get(entry->fwmark);
+			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
 		else
-			svc = __ip_vs_service_get(entry->protocol,
-						  entry->addr, entry->port);
+			svc = __ip_vs_service_get(AF_INET, entry->protocol,
+						  &addr, entry->port);
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2613,10 +2628,10 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 
 		/* prefill flags from service if it already exists */
 		if (usvc->fwmark)
-			svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
 		else
-			svc = __ip_vs_service_get(usvc->protocol, usvc->addr.ip,
-						  usvc->port);
+			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
+						  &usvc->addr, usvc->port);
 		if (svc) {
 			usvc->flags = svc->flags;
 			ip_vs_service_put(svc);
@@ -2644,10 +2659,10 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 		return ERR_PTR(ret);
 
 	if (usvc.fwmark)
-		return __ip_vs_svc_fwm_get(usvc.fwmark);
+		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
 	else
-		return __ip_vs_service_get(usvc.protocol, usvc.addr.ip,
-					   usvc.port);
+		return __ip_vs_service_get(usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
 }
 
 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2955,10 +2970,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.protocol, usvc.addr.ip,
-					  usvc.port);
+		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
+					  &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc.fwmark);
+		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
 
 	/* Unless we're adding a new service, the service must already exist */
 	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
-- 
cgit v1.2.3


From 3c2e0505d25cdc9425336f167fd4ff5f505aecff Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:38 +0200
Subject: IPVS: Add v6 support to ip_vs_service_get()

Add support for selecting services based on their address family to
ip_vs_service_get() and adjust the callers.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c       | 24 +++++++++++++-----------
 net/ipv4/ipvs/ip_vs_proto_tcp.c |  9 ++++++---
 net/ipv4/ipvs/ip_vs_proto_udp.c | 11 +++++++----
 3 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index a2d69b2ce6a..1f3fc66e694 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -421,23 +421,24 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
 }
 
 struct ip_vs_service *
-ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
+ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+		  const union nf_inet_addr *vaddr, __be16 vport)
 {
 	struct ip_vs_service *svc;
-	union nf_inet_addr _vaddr = { .ip = vaddr };
+
 	read_lock(&__ip_vs_svc_lock);
 
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(AF_INET, fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, vport);
+	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, FTPPORT);
+		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -455,16 +456,16 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, 0);
+		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
 	}
 
   out:
 	read_unlock(&__ip_vs_svc_lock);
 
-	IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
-		  fwmark, ip_vs_proto_name(protocol),
-		  NIPQUAD(vaddr), ntohs(vport),
-		  svc?"hit":"not hit");
+	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+		      fwmark, ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+		      svc ? "hit" : "not hit");
 
 	return svc;
 }
@@ -605,8 +606,9 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
+	union nf_inet_addr _vaddr = { .ip = vaddr };
 
-	svc = ip_vs_service_get(0, protocol, vaddr, vport);
+	svc = ip_vs_service_get(AF_INET, 0, protocol, &_vaddr, vport);
 	if (!svc)
 		return NULL;
 	dest = ip_vs_lookup_dest(svc, daddr, dport);
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 15860e1441b..fe93c9e6ff6 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -74,16 +74,19 @@ tcp_conn_schedule(struct sk_buff *skb,
 {
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
+	struct ip_vs_iphdr iph;
 
-	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+
+	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		*verdict = NF_DROP;
 		return 0;
 	}
 
 	if (th->syn &&
-	    (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
-				     ip_hdr(skb)->daddr, th->dest))) {
+	    (svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol,
+				     &iph.daddr, th->dest))) {
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8dfad5db829..d208ed6eb9f 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -80,16 +80,19 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
 {
 	struct ip_vs_service *svc;
 	struct udphdr _udph, *uh;
+	struct ip_vs_iphdr iph;
 
-	uh = skb_header_pointer(skb, ip_hdrlen(skb),
-				sizeof(_udph), &_udph);
+	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+
+	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
 	if (uh == NULL) {
 		*verdict = NF_DROP;
 		return 0;
 	}
 
-	if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
-				     ip_hdr(skb)->daddr, uh->dest))) {
+	svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol,
+				&iph.daddr, uh->dest);
+	if (svc) {
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
-- 
cgit v1.2.3


From b14198f6c1bea1687d20723db35d8effecd9d899 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:39 +0200
Subject: IPVS: Add IPv6 support flag to schedulers

Add 'supports_ipv6' flag to struct ip_vs_scheduler to indicate whether a
scheduler supports IPv6. Set the flag to 1 in schedulers that work with
IPv6, 0 otherwise. This flag is checked in a later patch while trying to
add a service with a specific scheduler. Adjust debug in v6-supporting
schedulers to work with both address families.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_dh.c    |  3 +++
 net/ipv4/ipvs/ip_vs_lblc.c  |  3 +++
 net/ipv4/ipvs/ip_vs_lblcr.c |  3 +++
 net/ipv4/ipvs/ip_vs_lc.c    | 11 +++++++----
 net/ipv4/ipvs/ip_vs_nq.c    | 15 +++++++++------
 net/ipv4/ipvs/ip_vs_rr.c    | 13 ++++++++-----
 net/ipv4/ipvs/ip_vs_sed.c   | 15 +++++++++------
 net/ipv4/ipvs/ip_vs_sh.c    |  3 +++
 net/ipv4/ipvs/ip_vs_wlc.c   | 15 +++++++++------
 net/ipv4/ipvs/ip_vs_wrr.c   | 15 +++++++++------
 10 files changed, 63 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index 9f9d795dbd7..a16943fd72f 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -234,6 +234,9 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
 	.init_service =		ip_vs_dh_init_svc,
 	.done_service =		ip_vs_dh_done_svc,
 	.update_service =	ip_vs_dh_update_svc,
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 69309edc0c4..6ecef3518ca 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -522,6 +522,9 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
 	.init_service =		ip_vs_lblc_init_svc,
 	.done_service =		ip_vs_lblc_done_svc,
 	.schedule =		ip_vs_lblc_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 51c746e2083..1f75ea83bcf 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -722,6 +722,9 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
 	.init_service =		ip_vs_lblcr_init_svc,
 	.done_service =		ip_vs_lblcr_done_svc,
 	.schedule =		ip_vs_lblcr_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
index 551d293347f..b69f808ac46 100644
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -67,10 +67,10 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	}
 
 	if (least)
-	IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->inactconns));
+	IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->inactconns));
 
 	return least;
 }
@@ -81,6 +81,9 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = {
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
 	.schedule =		ip_vs_lc_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
index aa0e32ad348..9a2d8033f08 100644
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -99,12 +99,12 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		return NULL;
 
   out:
-	IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->refcnt),
-		  atomic_read(&least->weight), loh);
+	IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
 
 	return least;
 }
@@ -116,6 +116,9 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
 	.schedule =		ip_vs_nq_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
index 27f0b624283..a22195f68ac 100644
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -74,11 +74,11 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
   out:
 	svc->sched_data = q;
 	write_unlock(&svc->sched_lock);
-	IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
-		  "activeconns %d refcnt %d weight %d\n",
-		  NIPQUAD(dest->addr.ip), ntohs(dest->port),
-		  atomic_read(&dest->activeconns),
-		  atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+	IP_VS_DBG_BUF(6, "RR: server %s:%u "
+		      "activeconns %d refcnt %d weight %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->activeconns),
+		      atomic_read(&dest->refcnt), atomic_read(&dest->weight));
 
 	return dest;
 }
@@ -89,6 +89,9 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
 	.init_service =		ip_vs_rr_init_svc,
 	.update_service =	ip_vs_rr_update_svc,
 	.schedule =		ip_vs_rr_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
index 38b574b2fdf..7d2f22f04b8 100644
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -101,12 +101,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		}
 	}
 
-	IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->refcnt),
-		  atomic_read(&least->weight), loh);
+	IP_VS_DBG_BUF(6, "SED: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
 
 	return least;
 }
@@ -118,6 +118,9 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
 	.schedule =		ip_vs_sed_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index c9e54e2ec29..1d96de27fef 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -231,6 +231,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list	 =		LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
 	.init_service =		ip_vs_sh_init_svc,
 	.done_service =		ip_vs_sh_done_svc,
 	.update_service =	ip_vs_sh_update_svc,
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
index 09fd993040f..8c596e71259 100644
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -89,12 +89,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		}
 	}
 
-	IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->refcnt),
-		  atomic_read(&least->weight), loh);
+	IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
 
 	return least;
 }
@@ -106,6 +106,9 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler =
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
 	.schedule =		ip_vs_wlc_schedule,
 };
 
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
index 19c49b234f3..7ea92fed50b 100644
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -195,12 +195,12 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		}
 	}
 
-	IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
-		  "activeconns %d refcnt %d weight %d\n",
-		  NIPQUAD(dest->addr.ip), ntohs(dest->port),
-		  atomic_read(&dest->activeconns),
-		  atomic_read(&dest->refcnt),
-		  atomic_read(&dest->weight));
+	IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+		      "activeconns %d refcnt %d weight %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->activeconns),
+		      atomic_read(&dest->refcnt),
+		      atomic_read(&dest->weight));
 
   out:
 	write_unlock(&svc->sched_lock);
@@ -213,6 +213,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
 	.refcnt =		ATOMIC_INIT(0),
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
 	.init_service =		ip_vs_wrr_init_svc,
 	.done_service =		ip_vs_wrr_done_svc,
 	.update_service =	ip_vs_wrr_update_svc,
-- 
cgit v1.2.3


From 51ef348b14183789e4cb3444d05ce83b1b69d8fb Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:40 +0200
Subject: IPVS: Add 'af' args to protocol handler functions

Add 'af' arguments to conn_schedule(), conn_in_get(), conn_out_get() and
csum_check() function pointers in struct ip_vs_protocol. Extend the
respective functions for TCP, UDP, AH and ESP and adjust the callers.

The changes in the callers need to be somewhat extensive, since they now
need to pass a filled out struct ip_vs_iphdr * to the modified functions
instead of a struct iphdr *.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c         | 64 +++++++++++++++---------------
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 56 +++++++++++++-------------
 net/ipv4/ipvs/ip_vs_proto_tcp.c    | 79 ++++++++++++++++++++++++-------------
 net/ipv4/ipvs/ip_vs_proto_udp.c    | 81 ++++++++++++++++++++++++--------------
 4 files changed, 162 insertions(+), 118 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 4a54f33b60d..34aaa1480d9 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -572,6 +572,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 	struct iphdr *iph;
 	struct icmphdr	_icmph, *ic;
 	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_iphdr ciph;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	unsigned int offset, ihl, verdict;
@@ -627,8 +628,9 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 
 	offset += cih->ihl * 4;
 
+	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_out_get(skb, pp, cih, offset, 1);
+	cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
 	if (!cp)
 		return NF_ACCEPT;
 
@@ -686,43 +688,41 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	  const struct net_device *in, const struct net_device *out,
 	  int (*okfn)(struct sk_buff *))
 {
-	struct iphdr	*iph;
+	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int ihl;
 
 	EnterFunction(11);
 
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
-	iph = ip_hdr(skb);
-	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
 		int related, verdict = ip_vs_out_icmp(skb, &related);
 
 		if (related)
 			return verdict;
-		iph = ip_hdr(skb);
+		ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
 	}
 
-	pp = ip_vs_proto_get(iph->protocol);
+	pp = ip_vs_proto_get(iph.protocol);
 	if (unlikely(!pp))
 		return NF_ACCEPT;
 
 	/* reassemble IP fragments */
-	if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
+	if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
 		     !pp->dont_defrag)) {
 		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
 			return NF_STOLEN;
-		iph = ip_hdr(skb);
-	}
 
-	ihl = iph->ihl << 2;
+		ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+	}
 
 	/*
 	 * Check if the packet belongs to an existing entry
 	 */
-	cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
+	cp = pp->conn_out_get(AF_INET, skb, pp, &iph, iph.len, 0);
 
 	if (unlikely(!cp)) {
 		if (sysctl_ip_vs_nat_icmp_send &&
@@ -730,18 +730,18 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 		     pp->protocol == IPPROTO_UDP)) {
 			__be16 _ports[2], *pptr;
 
-			pptr = skb_header_pointer(skb, ihl,
+			pptr = skb_header_pointer(skb, iph.len,
 						  sizeof(_ports), _ports);
 			if (pptr == NULL)
 				return NF_ACCEPT;	/* Not for me */
-			if (ip_vs_lookup_real_service(iph->protocol,
-						      iph->saddr, pptr[0])) {
+			if (ip_vs_lookup_real_service(iph.protocol,
+						      iph.saddr.ip, pptr[0])) {
 				/*
 				 * Notify the real server: there is no
 				 * existing entry if it is not RST
 				 * packet or not TCP packet.
 				 */
-				if (iph->protocol != IPPROTO_TCP
+				if (iph.protocol != IPPROTO_TCP
 				    || !is_tcp_reset(skb)) {
 					icmp_send(skb,ICMP_DEST_UNREACH,
 						  ICMP_PORT_UNREACH, 0);
@@ -756,7 +756,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 
 	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
 
-	if (!skb_make_writable(skb, ihl))
+	if (!skb_make_writable(skb, iph.len))
 		goto drop;
 
 	/* mangle the packet */
@@ -804,6 +804,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct iphdr *iph;
 	struct icmphdr	_icmph, *ic;
 	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_iphdr ciph;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	unsigned int offset, ihl, verdict;
@@ -860,8 +861,9 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	offset += cih->ihl * 4;
 
+	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_in_get(skb, pp, cih, offset, 1);
+	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
 	if (!cp)
 		return NF_ACCEPT;
 
@@ -897,11 +899,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	 const struct net_device *in, const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	struct iphdr	*iph;
+	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
 	int ret, restart;
-	int ihl;
+
+	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
 
 	/*
 	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
@@ -909,38 +912,35 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	 */
 	if (unlikely(skb->pkt_type != PACKET_HOST
 		     || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
-		IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
-			  skb->pkt_type,
-			  ip_hdr(skb)->protocol,
-			  NIPQUAD(ip_hdr(skb)->daddr));
+		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
+			      skb->pkt_type,
+			      iph.protocol,
+			      IP_VS_DBG_ADDR(AF_INET, &iph.daddr));
 		return NF_ACCEPT;
 	}
 
-	iph = ip_hdr(skb);
-	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
 		int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
 
 		if (related)
 			return verdict;
-		iph = ip_hdr(skb);
+		ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
 	}
 
 	/* Protocol supported? */
-	pp = ip_vs_proto_get(iph->protocol);
+	pp = ip_vs_proto_get(iph.protocol);
 	if (unlikely(!pp))
 		return NF_ACCEPT;
 
-	ihl = iph->ihl << 2;
-
 	/*
 	 * Check if the packet belongs to an existing connection entry
 	 */
-	cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
+	cp = pp->conn_in_get(AF_INET, skb, pp, &iph, iph.len, 0);
 
 	if (unlikely(!cp)) {
 		int v;
 
-		if (!pp->conn_schedule(skb, pp, &v, &cp))
+		if (!pp->conn_schedule(AF_INET, skb, pp, &v, &cp))
 			return v;
 	}
 
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
index 3f9ebd7639a..2a361a99174 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -39,25 +39,23 @@ struct isakmp_hdr {
 
 
 static struct ip_vs_conn *
-ah_esp_conn_in_get(const struct sk_buff *skb,
-		   struct ip_vs_protocol *pp,
-		   const struct iphdr *iph,
-		   unsigned int proto_off,
+ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		   const struct ip_vs_iphdr *iph, unsigned int proto_off,
 		   int inverse)
 {
 	struct ip_vs_conn *cp;
 
 	if (likely(!inverse)) {
 		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->saddr,
+				       iph->saddr.ip,
 				       htons(PORT_ISAKMP),
-				       iph->daddr,
+				       iph->daddr.ip,
 				       htons(PORT_ISAKMP));
 	} else {
 		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->daddr,
+				       iph->daddr.ip,
 				       htons(PORT_ISAKMP),
-				       iph->saddr,
+				       iph->saddr.ip,
 				       htons(PORT_ISAKMP));
 	}
 
@@ -66,12 +64,12 @@ ah_esp_conn_in_get(const struct sk_buff *skb,
 		 * We are not sure if the packet is from our
 		 * service, so our conn_schedule hook should return NF_ACCEPT
 		 */
-		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
-			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
-			  inverse ? "ICMP+" : "",
-			  pp->name,
-			  NIPQUAD(iph->saddr),
-			  NIPQUAD(iph->daddr));
+		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+			      "%s%s %s->%s\n",
+			      inverse ? "ICMP+" : "",
+			      pp->name,
+			      IP_VS_DBG_ADDR(af, &iph->saddr),
+			      IP_VS_DBG_ADDR(af, &iph->daddr));
 	}
 
 	return cp;
@@ -79,32 +77,35 @@ ah_esp_conn_in_get(const struct sk_buff *skb,
 
 
 static struct ip_vs_conn *
-ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		    const struct iphdr *iph, unsigned int proto_off, int inverse)
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+		    struct ip_vs_protocol *pp,
+		    const struct ip_vs_iphdr *iph,
+		    unsigned int proto_off,
+		    int inverse)
 {
 	struct ip_vs_conn *cp;
 
 	if (likely(!inverse)) {
 		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->saddr,
+					iph->saddr.ip,
 					htons(PORT_ISAKMP),
-					iph->daddr,
+					iph->daddr.ip,
 					htons(PORT_ISAKMP));
 	} else {
 		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->daddr,
+					iph->daddr.ip,
 					htons(PORT_ISAKMP),
-					iph->saddr,
+					iph->saddr.ip,
 					htons(PORT_ISAKMP));
 	}
 
 	if (!cp) {
-		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
-			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
-			  inverse ? "ICMP+" : "",
-			  pp->name,
-			  NIPQUAD(iph->saddr),
-			  NIPQUAD(iph->daddr));
+		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+			      "%s%s %s->%s\n",
+			      inverse ? "ICMP+" : "",
+			      pp->name,
+			      IP_VS_DBG_ADDR(af, &iph->saddr),
+			      IP_VS_DBG_ADDR(af, &iph->daddr));
 	}
 
 	return cp;
@@ -112,8 +113,7 @@ ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static int
-ah_esp_conn_schedule(struct sk_buff *skb,
-		     struct ip_vs_protocol *pp,
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		     int *verdict, struct ip_vs_conn **cpp)
 {
 	/*
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index fe93c9e6ff6..9211afa8f30 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -25,8 +25,9 @@
 
 
 static struct ip_vs_conn *
-tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct iphdr *iph, unsigned int proto_off, int inverse)
+tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		int inverse)
 {
 	__be16 _ports[2], *pptr;
 
@@ -36,18 +37,19 @@ tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 	if (likely(!inverse)) {
 		return ip_vs_conn_in_get(iph->protocol,
-					 iph->saddr, pptr[0],
-					 iph->daddr, pptr[1]);
+					 iph->saddr.ip, pptr[0],
+					 iph->daddr.ip, pptr[1]);
 	} else {
 		return ip_vs_conn_in_get(iph->protocol,
-					 iph->daddr, pptr[1],
-					 iph->saddr, pptr[0]);
+					 iph->daddr.ip, pptr[1],
+					 iph->saddr.ip, pptr[0]);
 	}
 }
 
 static struct ip_vs_conn *
-tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct iphdr *iph, unsigned int proto_off, int inverse)
+tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		 int inverse)
 {
 	__be16 _ports[2], *pptr;
 
@@ -57,26 +59,25 @@ tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 	if (likely(!inverse)) {
 		return ip_vs_conn_out_get(iph->protocol,
-					  iph->saddr, pptr[0],
-					  iph->daddr, pptr[1]);
+					  iph->saddr.ip, pptr[0],
+					  iph->daddr.ip, pptr[1]);
 	} else {
 		return ip_vs_conn_out_get(iph->protocol,
-					  iph->daddr, pptr[1],
-					  iph->saddr, pptr[0]);
+					  iph->daddr.ip, pptr[1],
+					  iph->saddr.ip, pptr[0]);
 	}
 }
 
 
 static int
-tcp_conn_schedule(struct sk_buff *skb,
-		  struct ip_vs_protocol *pp,
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		  int *verdict, struct ip_vs_conn **cpp)
 {
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
 	struct ip_vs_iphdr iph;
 
-	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 
 	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
@@ -85,8 +86,8 @@ tcp_conn_schedule(struct sk_buff *skb,
 	}
 
 	if (th->syn &&
-	    (svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol,
-				     &iph.daddr, th->dest))) {
+	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
+				     th->dest))) {
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -136,7 +137,7 @@ tcp_snat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(skb, pp))
+		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
 			return 0;
 
 		/* Call application helper if needed */
@@ -182,7 +183,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(skb, pp))
+		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
 			return 0;
 
 		/*
@@ -219,21 +220,43 @@ tcp_dnat_handler(struct sk_buff *skb,
 
 
 static int
-tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 {
-	const unsigned int tcphoff = ip_hdrlen(skb);
+	unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_NONE:
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 	case CHECKSUM_COMPLETE:
-		if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
-				      skb->len - tcphoff,
-				      ip_hdr(skb)->protocol, skb->csum)) {
-			IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-					 "Failed checksum for");
-			return 0;
-		}
+#ifdef CONFIG_IP_VS_IPV6
+		if (af == AF_INET6) {
+			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					    &ipv6_hdr(skb)->daddr,
+					    skb->len - tcphoff,
+					    ipv6_hdr(skb)->nexthdr,
+					    skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
+		} else
+#endif
+			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					      ip_hdr(skb)->daddr,
+					      skb->len - tcphoff,
+					      ip_hdr(skb)->protocol,
+					      skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
 		break;
 	default:
 		/* No need to checksum. */
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index d208ed6eb9f..d3a1b1f2d10 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -24,8 +24,9 @@
 #include <net/ip.h>
 
 static struct ip_vs_conn *
-udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct iphdr *iph, unsigned int proto_off, int inverse)
+udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		int inverse)
 {
 	struct ip_vs_conn *cp;
 	__be16 _ports[2], *pptr;
@@ -36,12 +37,12 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 	if (likely(!inverse)) {
 		cp = ip_vs_conn_in_get(iph->protocol,
-				       iph->saddr, pptr[0],
-				       iph->daddr, pptr[1]);
+				       iph->saddr.ip, pptr[0],
+				       iph->daddr.ip, pptr[1]);
 	} else {
 		cp = ip_vs_conn_in_get(iph->protocol,
-				       iph->daddr, pptr[1],
-				       iph->saddr, pptr[0]);
+				       iph->daddr.ip, pptr[1],
+				       iph->saddr.ip, pptr[0]);
 	}
 
 	return cp;
@@ -49,25 +50,25 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static struct ip_vs_conn *
-udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct iphdr *iph, unsigned int proto_off, int inverse)
+udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		 int inverse)
 {
 	struct ip_vs_conn *cp;
 	__be16 _ports[2], *pptr;
 
-	pptr = skb_header_pointer(skb, ip_hdrlen(skb),
-				  sizeof(_ports), _ports);
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return NULL;
 
 	if (likely(!inverse)) {
 		cp = ip_vs_conn_out_get(iph->protocol,
-					iph->saddr, pptr[0],
-					iph->daddr, pptr[1]);
+					iph->saddr.ip, pptr[0],
+					iph->daddr.ip, pptr[1]);
 	} else {
 		cp = ip_vs_conn_out_get(iph->protocol,
-					iph->daddr, pptr[1],
-					iph->saddr, pptr[0]);
+					iph->daddr.ip, pptr[1],
+					iph->saddr.ip, pptr[0]);
 	}
 
 	return cp;
@@ -75,14 +76,14 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static int
-udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		  int *verdict, struct ip_vs_conn **cpp)
 {
 	struct ip_vs_service *svc;
 	struct udphdr _udph, *uh;
 	struct ip_vs_iphdr iph;
 
-	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 
 	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
 	if (uh == NULL) {
@@ -90,7 +91,7 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return 0;
 	}
 
-	svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol,
+	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				&iph.daddr, uh->dest);
 	if (svc) {
 		if (ip_vs_todrop()) {
@@ -143,7 +144,7 @@ udp_snat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(skb, pp))
+		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
 			return 0;
 
 		/*
@@ -195,7 +196,7 @@ udp_dnat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(skb, pp))
+		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
 			return 0;
 
 		/*
@@ -234,10 +235,17 @@ udp_dnat_handler(struct sk_buff *skb,
 
 
 static int
-udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 {
 	struct udphdr _udph, *uh;
-	const unsigned int udphoff = ip_hdrlen(skb);
+	unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
 
 	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
 	if (uh == NULL)
@@ -249,15 +257,28 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
 			skb->csum = skb_checksum(skb, udphoff,
 						 skb->len - udphoff, 0);
 		case CHECKSUM_COMPLETE:
-			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
-					      ip_hdr(skb)->daddr,
-					      skb->len - udphoff,
-					      ip_hdr(skb)->protocol,
-					      skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-						 "Failed checksum for");
-				return 0;
-			}
+#ifdef CONFIG_IP_VS_IPV6
+			if (af == AF_INET6) {
+				if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						    &ipv6_hdr(skb)->daddr,
+						    skb->len - udphoff,
+						    ipv6_hdr(skb)->nexthdr,
+						    skb->csum)) {
+					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+							 "Failed checksum for");
+					return 0;
+				}
+			} else
+#endif
+				if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+						      ip_hdr(skb)->daddr,
+						      skb->len - udphoff,
+						      ip_hdr(skb)->protocol,
+						      skb->csum)) {
+					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+							 "Failed checksum for");
+					return 0;
+				}
 			break;
 		default:
 			/* No need to checksum. */
-- 
cgit v1.2.3


From 3b047d9d0407e78a52f009835a0e26cb62edb8c7 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:41 +0200
Subject: IPVS: Add protocol debug functions for IPv6

Add protocol (TCP, UDP, AH, ESP) debug functions for IPv6 packet debug
output.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_proto.c        | 63 +++++++++++++++++++++++++++++++++++---
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 36 ++++++++++++++++++++--
 2 files changed, 93 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index 6099a88fc20..50f6215beda 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -152,10 +152,10 @@ const char * ip_vs_state_name(__u16 proto, int state)
 
 
 void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
-			  const struct sk_buff *skb,
-			  int offset,
-			  const char *msg)
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+			     const struct sk_buff *skb,
+			     int offset,
+			     const char *msg)
 {
 	char buf[128];
 	struct iphdr _iph, *ih;
@@ -189,6 +189,61 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
 	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+			     const struct sk_buff *skb,
+			     int offset,
+			     const char *msg)
+{
+	char buf[192];
+	struct ipv6hdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else if (ih->nexthdr == IPPROTO_FRAGMENT)
+		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
+			pp->name, NIP6(ih->saddr),
+			NIP6(ih->daddr));
+	else {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
+				pp->name,
+				NIP6(ih->saddr),
+				NIP6(ih->daddr));
+		else
+			sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
+				pp->name,
+				NIP6(ih->saddr),
+				ntohs(pptr[0]),
+				NIP6(ih->daddr),
+				ntohs(pptr[1]));
+	}
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+#endif
+
+
+void
+ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+			  const struct sk_buff *skb,
+			  int offset,
+			  const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (skb->protocol == __constant_htons(ETH_P_IPV6))
+		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+	else
+#endif
+		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
 
 int __init ip_vs_protocol_init(void)
 {
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
index 2a361a99174..4b0b8f268d1 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -125,8 +125,8 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		    int offset, const char *msg)
+ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		       int offset, const char *msg)
 {
 	char buf[256];
 	struct iphdr _iph, *ih;
@@ -142,6 +142,38 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static void
+ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		       int offset, const char *msg)
+{
+	char buf[256];
+	struct ipv6hdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else
+		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
+			pp->name, NIP6(ih->saddr),
+			NIP6(ih->daddr));
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+#endif
+
+static void
+ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		    int offset, const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (skb->protocol == __constant_htons(ETH_P_IPV6))
+		ah_esp_debug_packet_v6(pp, skb, offset, msg);
+	else
+#endif
+		ah_esp_debug_packet_v4(pp, skb, offset, msg);
+}
+
 
 static void ah_esp_init(struct ip_vs_protocol *pp)
 {
-- 
cgit v1.2.3


From 0bbdd42b7efa66685b6d74701bcde3a596a3a59d Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:42 +0200
Subject: IPVS: Extend protocol DNAT/SNAT and state handlers

Extend protocol DNAT/SNAT and state handlers to work with IPv6. Also
change/introduce new checksumming helper functions for this.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 85 ++++++++++++++++++++++++++++++++---------
 net/ipv4/ipvs/ip_vs_proto_udp.c | 82 ++++++++++++++++++++++++++++++---------
 2 files changed, 131 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 9211afa8f30..3daae43ae44 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -114,11 +114,21 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static inline void
-tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip,
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
 		     __be16 oldport, __be16 newport)
 {
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcph->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(tcph->check))));
+	else
+#endif
 	tcph->check =
-		csum_fold(ip_vs_check_diff4(oldip, newip,
+		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				 ip_vs_check_diff2(oldport, newport,
 						~csum_unfold(tcph->check))));
 }
@@ -129,7 +139,14 @@ tcp_snat_handler(struct sk_buff *skb,
 		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
 	struct tcphdr *tcph;
-	const unsigned int tcphoff = ip_hdrlen(skb);
+	unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
@@ -137,7 +154,7 @@ tcp_snat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/* Call application helper if needed */
@@ -145,13 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
 			return 0;
 	}
 
-	tcph = (void *)ip_hdr(skb) + tcphoff;
+	tcph = (void *)skb_network_header(skb) + tcphoff;
 	tcph->source = cp->vport;
 
 	/* Adjust TCP checksums */
 	if (!cp->app) {
 		/* Only port and addr are changed, do fast csum update */
-		tcp_fast_csum_update(tcph, cp->daddr.ip, cp->vaddr.ip,
+		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -159,9 +176,20 @@ tcp_snat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		tcph->check = 0;
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-		tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip,
-						skb->len - tcphoff,
-						cp->protocol, skb->csum);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+						      &cp->caddr.in6,
+						      skb->len - tcphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+							cp->caddr.ip,
+							skb->len - tcphoff,
+							cp->protocol,
+							skb->csum);
+
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, tcph->check,
 			  (char*)&(tcph->check) - (char*)tcph);
@@ -175,7 +203,14 @@ tcp_dnat_handler(struct sk_buff *skb,
 		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
 	struct tcphdr *tcph;
-	const unsigned int tcphoff = ip_hdrlen(skb);
+	unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
@@ -183,7 +218,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/*
@@ -194,7 +229,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 			return 0;
 	}
 
-	tcph = (void *)ip_hdr(skb) + tcphoff;
+	tcph = (void *)skb_network_header(skb) + tcphoff;
 	tcph->dest = cp->dport;
 
 	/*
@@ -202,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	 */
 	if (!cp->app) {
 		/* Only port and addr are changed, do fast csum update */
-		tcp_fast_csum_update(tcph, cp->vaddr.ip, cp->daddr.ip,
+		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -210,9 +245,19 @@ tcp_dnat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		tcph->check = 0;
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-		tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip,
-						skb->len - tcphoff,
-						cp->protocol, skb->csum);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+						      &cp->daddr.in6,
+						      skb->len - tcphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+							cp->daddr.ip,
+							skb->len - tcphoff,
+							cp->protocol,
+							skb->csum);
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 	return 1;
@@ -487,7 +532,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 {
 	struct tcphdr _tcph, *th;
 
-	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+#ifdef CONFIG_IP_VS_IPV6
+	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+	int ihl = ip_hdrlen(skb);
+#endif
+
+	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
 	if (th == NULL)
 		return 0;
 
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index d3a1b1f2d10..6cca0ad8e32 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -120,13 +120,23 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 
 static inline void
-udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip,
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
 		     __be16 oldport, __be16 newport)
 {
-	uhdr->check =
-		csum_fold(ip_vs_check_diff4(oldip, newip,
-				 ip_vs_check_diff2(oldport, newport,
-					~csum_unfold(uhdr->check))));
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		uhdr->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(uhdr->check))));
+	else
+#endif
+		uhdr->check =
+			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(uhdr->check))));
 	if (!uhdr->check)
 		uhdr->check = CSUM_MANGLED_0;
 }
@@ -136,7 +146,14 @@ udp_snat_handler(struct sk_buff *skb,
 		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
 	struct udphdr *udph;
-	const unsigned int udphoff = ip_hdrlen(skb);
+	unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
@@ -144,7 +161,7 @@ udp_snat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/*
@@ -154,7 +171,7 @@ udp_snat_handler(struct sk_buff *skb,
 			return 0;
 	}
 
-	udph = (void *)ip_hdr(skb) + udphoff;
+	udph = (void *)skb_network_header(skb) + udphoff;
 	udph->source = cp->vport;
 
 	/*
@@ -162,7 +179,7 @@ udp_snat_handler(struct sk_buff *skb,
 	 */
 	if (!cp->app && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
-		udp_fast_csum_update(udph, cp->daddr.ip, cp->vaddr.ip,
+		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -170,9 +187,19 @@ udp_snat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		udph->check = 0;
 		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-		udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip,
-						skb->len - udphoff,
-						cp->protocol, skb->csum);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+						      &cp->caddr.in6,
+						      skb->len - udphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+							cp->caddr.ip,
+							skb->len - udphoff,
+							cp->protocol,
+							skb->csum);
 		if (udph->check == 0)
 			udph->check = CSUM_MANGLED_0;
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
@@ -188,7 +215,14 @@ udp_dnat_handler(struct sk_buff *skb,
 		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
 	struct udphdr *udph;
-	unsigned int udphoff = ip_hdrlen(skb);
+	unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
@@ -196,7 +230,7 @@ udp_dnat_handler(struct sk_buff *skb,
 
 	if (unlikely(cp->app != NULL)) {
 		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp))
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/*
@@ -207,7 +241,7 @@ udp_dnat_handler(struct sk_buff *skb,
 			return 0;
 	}
 
-	udph = (void *)ip_hdr(skb) + udphoff;
+	udph = (void *)skb_network_header(skb) + udphoff;
 	udph->dest = cp->dport;
 
 	/*
@@ -215,7 +249,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	 */
 	if (!cp->app && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
-		udp_fast_csum_update(udph, cp->vaddr.ip, cp->daddr.ip,
+		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -223,9 +257,19 @@ udp_dnat_handler(struct sk_buff *skb,
 		/* full checksum calculation */
 		udph->check = 0;
 		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-		udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip,
-						skb->len - udphoff,
-						cp->protocol, skb->csum);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			udph->check = csum_ipv6_magic(&cp->caddr.in6,
+						      &cp->daddr.in6,
+						      skb->len - udphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			udph->check = csum_tcpudp_magic(cp->caddr.ip,
+							cp->daddr.ip,
+							skb->len - udphoff,
+							cp->protocol,
+							skb->csum);
 		if (udph->check == 0)
 			udph->check = CSUM_MANGLED_0;
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-- 
cgit v1.2.3


From 28364a59f3dfe7fed3560ec7aff9b7aeb02824fb Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:43 +0200
Subject: IPVS: Extend functions for getting/creating connections

Extend functions for getting/creating connections and connection
templates for IPv6 support and fix the callers.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_conn.c         | 100 ++++++++++++++++++++-------------
 net/ipv4/ipvs/ip_vs_core.c         | 112 ++++++++++++++++++++-----------------
 net/ipv4/ipvs/ip_vs_ftp.c          |  45 +++++++--------
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c |  24 ++++----
 net/ipv4/ipvs/ip_vs_proto_tcp.c    |  24 ++++----
 net/ipv4/ipvs/ip_vs_proto_udp.c    |  24 ++++----
 net/ipv4/ipvs/ip_vs_sync.c         |  27 +++++----
 7 files changed, 198 insertions(+), 158 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 639d4bc7fc1..15eec282b17 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -114,9 +114,18 @@ static inline void ct_write_unlock_bh(unsigned key)
 /*
  *	Returns hash value for IPVS connection entry
  */
-static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port)
+static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+				       const union nf_inet_addr *addr,
+				       __be16 port)
 {
-	return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd)
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+				    (__force u32)port, proto, ip_vs_conn_rnd)
+			& IP_VS_CONN_TAB_MASK;
+#endif
+	return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+			    ip_vs_conn_rnd)
 		& IP_VS_CONN_TAB_MASK;
 }
 
@@ -131,7 +140,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport);
+	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
 
 	ct_write_lock(hash);
 
@@ -162,7 +171,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport);
+	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
 
 	ct_write_lock(hash);
 
@@ -187,18 +196,21 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
  *	d_addr, d_port: pkt dest address (load balancer)
  */
 static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (s_addr == cp->caddr.ip && s_port == cp->cport &&
-		    d_port == cp->vport && d_addr == cp->vaddr.ip &&
+		if (cp->af == af &&
+		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
+		    s_port == cp->cport && d_port == cp->vport &&
 		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 		    protocol == cp->protocol) {
 			/* HIT */
@@ -214,37 +226,42 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
 }
 
 struct ip_vs_conn *ip_vs_conn_in_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
 {
 	struct ip_vs_conn *cp;
 
-	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
+	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
 	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
+		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
+					 d_port);
 
-	IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
-		  ip_vs_proto_name(protocol),
-		  NIPQUAD(s_addr), ntohs(s_port),
-		  NIPQUAD(d_addr), ntohs(d_port),
-		  cp?"hit":"not hit");
+	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
 /* Get reference to connection template */
 struct ip_vs_conn *ip_vs_ct_in_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (s_addr == cp->caddr.ip && s_port == cp->cport &&
-		    d_port == cp->vport && d_addr == cp->vaddr.ip &&
+		if (cp->af == af &&
+		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
+		    s_port == cp->cport && d_port == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
 		    protocol == cp->protocol) {
 			/* HIT */
@@ -257,11 +274,11 @@ struct ip_vs_conn *ip_vs_ct_in_get
   out:
 	ct_read_unlock(hash);
 
-	IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
-		  ip_vs_proto_name(protocol),
-		  NIPQUAD(s_addr), ntohs(s_port),
-		  NIPQUAD(d_addr), ntohs(d_port),
-		  cp?"hit":"not hit");
+	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      cp ? "hit" : "not hit");
 
 	return cp;
 }
@@ -273,7 +290,8 @@ struct ip_vs_conn *ip_vs_ct_in_get
  *	d_addr, d_port: pkt dest address (foreign host)
  */
 struct ip_vs_conn *ip_vs_conn_out_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp, *ret=NULL;
@@ -281,13 +299,15 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
+	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (d_addr == cp->caddr.ip && d_port == cp->cport &&
-		    s_port == cp->dport && s_addr == cp->daddr.ip &&
+		if (cp->af == af &&
+		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
+		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
+		    d_port == cp->cport && s_port == cp->dport &&
 		    protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
@@ -298,11 +318,11 @@ struct ip_vs_conn *ip_vs_conn_out_get
 
 	ct_read_unlock(hash);
 
-	IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
-		  ip_vs_proto_name(protocol),
-		  NIPQUAD(s_addr), ntohs(s_port),
-		  NIPQUAD(d_addr), ntohs(d_port),
-		  ret?"hit":"not hit");
+	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ret ? "hit" : "not hit");
 
 	return ret;
 }
@@ -625,8 +645,9 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
  *	Create a new connection entry and hash it into the ip_vs_conn_tab
  */
 struct ip_vs_conn *
-ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport,
-	       __be32 daddr, __be16 dport, unsigned flags,
+ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
+	       const union nf_inet_addr *vaddr, __be16 vport,
+	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
 	       struct ip_vs_dest *dest)
 {
 	struct ip_vs_conn *cp;
@@ -640,12 +661,13 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport
 
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+	cp->af		   = af;
 	cp->protocol	   = proto;
-	cp->caddr.ip	   = caddr;
+	ip_vs_addr_copy(af, &cp->caddr, caddr);
 	cp->cport	   = cport;
-	cp->vaddr.ip	   = vaddr;
+	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
 	cp->vport	   = vport;
-	cp->daddr.ip	   = daddr;
+	ip_vs_addr_copy(af, &cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
 	spin_lock_init(&cp->lock);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 34aaa1480d9..2d5a4331709 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -173,19 +173,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		    __be16 ports[2])
 {
 	struct ip_vs_conn *cp = NULL;
-	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
 	__be16  dport;	 /* destination port to forward */
-	__be32  snet;	 /* source network of the client, after masking */
+	union nf_inet_addr snet;	/* source network of the client,
+					   after masking */
+	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
 
 	/* Mask saddr with the netmask to adjust template granularity */
-	snet = iph->saddr & svc->netmask;
+	snet.ip = iph.saddr.ip & svc->netmask;
 
 	IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
 		  "mnet %u.%u.%u.%u\n",
-		  NIPQUAD(iph->saddr), ntohs(ports[0]),
-		  NIPQUAD(iph->daddr), ntohs(ports[1]),
+		  NIPQUAD(iph.saddr.ip), ntohs(ports[0]),
+		  NIPQUAD(iph.daddr.ip), ntohs(ports[1]),
 		  NIPQUAD(snet));
 
 	/*
@@ -204,11 +206,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	if (ports[1] == svc->port) {
 		/* Check if a template already exists */
 		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
-					       iph->daddr, ports[1]);
+			ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0,
+					     &iph.daddr, ports[1]);
 		else
-			ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
-					       iph->daddr, 0);
+			ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0,
+					     &iph.daddr, 0);
 
 		if (!ct || !ip_vs_check_template(ct)) {
 			/*
@@ -228,18 +230,18 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 			 * for ftp service.
 			 */
 			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(iph->protocol,
-						    snet, 0,
-						    iph->daddr,
+				ct = ip_vs_conn_new(AF_INET, iph.protocol,
+						    &snet, 0,
+						    &iph.daddr,
 						    ports[1],
-						    dest->addr.ip, dest->port,
+						    &dest->addr, dest->port,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			else
-				ct = ip_vs_conn_new(iph->protocol,
-						    snet, 0,
-						    iph->daddr, 0,
-						    dest->addr.ip, 0,
+				ct = ip_vs_conn_new(AF_INET, iph.protocol,
+						    &snet, 0,
+						    &iph.daddr, 0,
+						    &dest->addr, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			if (ct == NULL)
@@ -258,12 +260,16 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
 		 */
-		if (svc->fwmark)
-			ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
-					       htonl(svc->fwmark), 0);
-		else
-			ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
-					       iph->daddr, 0);
+		if (svc->fwmark) {
+			union nf_inet_addr fwmark = {
+				.all = { 0, 0, 0, htonl(svc->fwmark) }
+			};
+
+			ct = ip_vs_ct_in_get(AF_INET, IPPROTO_IP, &snet, 0,
+					     &fwmark, 0);
+		} else
+			ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0,
+					     &iph.daddr, 0);
 
 		if (!ct || !ip_vs_check_template(ct)) {
 			/*
@@ -282,18 +288,22 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 			/*
 			 * Create a template according to the service
 			 */
-			if (svc->fwmark)
-				ct = ip_vs_conn_new(IPPROTO_IP,
-						    snet, 0,
-						    htonl(svc->fwmark), 0,
-						    dest->addr.ip, 0,
+			if (svc->fwmark) {
+				union nf_inet_addr fwmark = {
+					.all = { 0, 0, 0, htonl(svc->fwmark) }
+				};
+
+				ct = ip_vs_conn_new(AF_INET, IPPROTO_IP,
+						    &snet, 0,
+						    &fwmark, 0,
+						    &dest->addr, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
-			else
-				ct = ip_vs_conn_new(iph->protocol,
-						    snet, 0,
-						    iph->daddr, 0,
-						    dest->addr.ip, 0,
+			} else
+				ct = ip_vs_conn_new(AF_INET, iph.protocol,
+						    &snet, 0,
+						    &iph.daddr, 0,
+						    &dest->addr, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			if (ct == NULL)
@@ -310,10 +320,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/*
 	 *    Create a new connection according to the template
 	 */
-	cp = ip_vs_conn_new(iph->protocol,
-			    iph->saddr, ports[0],
-			    iph->daddr, ports[1],
-			    dest->addr.ip, dport,
+	cp = ip_vs_conn_new(AF_INET, iph.protocol,
+			    &iph.saddr, ports[0],
+			    &iph.daddr, ports[1],
+			    &dest->addr, dport,
 			    0,
 			    dest);
 	if (cp == NULL) {
@@ -342,12 +352,12 @@ struct ip_vs_conn *
 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
 	struct ip_vs_conn *cp = NULL;
-	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	__be16 _ports[2], *pptr;
 
-	pptr = skb_header_pointer(skb, iph->ihl*4,
-				  sizeof(_ports), _ports);
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return NULL;
 
@@ -377,10 +387,10 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	/*
 	 *    Create a connection entry.
 	 */
-	cp = ip_vs_conn_new(iph->protocol,
-			    iph->saddr, pptr[0],
-			    iph->daddr, pptr[1],
-			    dest->addr.ip, dest->port ? dest->port : pptr[1],
+	cp = ip_vs_conn_new(AF_INET, iph.protocol,
+			    &iph.saddr, pptr[0],
+			    &iph.daddr, pptr[1],
+			    &dest->addr, dest->port ? dest->port : pptr[1],
 			    0,
 			    dest);
 	if (cp == NULL)
@@ -408,10 +418,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		struct ip_vs_protocol *pp)
 {
 	__be16 _ports[2], *pptr;
-	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_iphdr iph;
+	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
 
-	pptr = skb_header_pointer(skb, iph->ihl*4,
-				  sizeof(_ports), _ports);
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 	if (pptr == NULL) {
 		ip_vs_service_put(svc);
 		return NF_DROP;
@@ -421,7 +431,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	   and the destination is RTN_UNICAST (and not local), then create
 	   a cache_bypass connection entry */
 	if (sysctl_ip_vs_cache_bypass && svc->fwmark
-	    && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
+	    && (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST)) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
 
@@ -429,9 +439,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 
 		/* create a new connection entry */
 		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
-		cp = ip_vs_conn_new(iph->protocol,
-				    iph->saddr, pptr[0],
-				    iph->daddr, pptr[1],
+		cp = ip_vs_conn_new(AF_INET, iph.protocol,
+				    &iph.saddr, pptr[0],
+				    &iph.daddr, pptr[1],
 				    0, 0,
 				    IP_VS_CONN_F_BYPASS,
 				    NULL);
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
index bfe5d7050a5..0c3fbe0de5f 100644
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -140,7 +140,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	struct tcphdr *th;
 	char *data, *data_limit;
 	char *start, *end;
-	__be32 from;
+	union nf_inet_addr from;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
 	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
@@ -166,24 +166,25 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		if (ip_vs_ftp_get_addrport(data, data_limit,
 					   SERVER_STRING,
 					   sizeof(SERVER_STRING)-1, ')',
-					   &from, &port,
+					   &from.ip, &port,
 					   &start, &end) != 1)
 			return 1;
 
 		IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
 			  "%u.%u.%u.%u:%d detected\n",
-			  NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr.ip), 0);
+			  NIPQUAD(from.ip), ntohs(port),
+			  NIPQUAD(cp->caddr.ip), 0);
 
 		/*
 		 * Now update or create an connection entry for it
 		 */
-		n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
-					  cp->caddr.ip, 0);
+		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
+					  &cp->caddr, 0);
 		if (!n_cp) {
-			n_cp = ip_vs_conn_new(IPPROTO_TCP,
-					      cp->caddr.ip, 0,
-					      cp->vaddr.ip, port,
-					      from, port,
+			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+					      &cp->caddr, 0,
+					      &cp->vaddr, port,
+					      &from, port,
 					      IP_VS_CONN_F_NO_CPORT,
 					      cp->dest);
 			if (!n_cp)
@@ -196,9 +197,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		/*
 		 * Replace the old passive address with the new one
 		 */
-		from = n_cp->vaddr.ip;
+		from.ip = n_cp->vaddr.ip;
 		port = n_cp->vport;
-		sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
+		sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
 			(ntohs(port)>>8)&255, ntohs(port)&255);
 		buf_len = strlen(buf);
 
@@ -243,7 +244,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	struct tcphdr *th;
 	char *data, *data_start, *data_limit;
 	char *start, *end;
-	__be32 to;
+	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
 
@@ -291,12 +292,12 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	 */
 	if (ip_vs_ftp_get_addrport(data_start, data_limit,
 				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
-				   '\r', &to, &port,
+				   '\r', &to.ip, &port,
 				   &start, &end) != 1)
 		return 1;
 
 	IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
-		  NIPQUAD(to), ntohs(port));
+		  NIPQUAD(to.ip), ntohs(port));
 
 	/* Passive mode off */
 	cp->app_data = NULL;
@@ -306,16 +307,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	 */
 	IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
 		  ip_vs_proto_name(iph->protocol),
-		  NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
+		  NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
 
-	n_cp = ip_vs_conn_in_get(iph->protocol,
-				 to, port,
-				 cp->vaddr.ip, htons(ntohs(cp->vport)-1));
+	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
+				 &to, port,
+				 &cp->vaddr, htons(ntohs(cp->vport)-1));
 	if (!n_cp) {
-		n_cp = ip_vs_conn_new(IPPROTO_TCP,
-				      to, port,
-				      cp->vaddr.ip, htons(ntohs(cp->vport)-1),
-				      cp->daddr.ip, htons(ntohs(cp->dport)-1),
+		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+				      &to, port,
+				      &cp->vaddr, htons(ntohs(cp->vport)-1),
+				      &cp->daddr, htons(ntohs(cp->dport)-1),
 				      0,
 				      cp->dest);
 		if (!n_cp)
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
index 4b0b8f268d1..2b18a78d039 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -46,16 +46,16 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 	struct ip_vs_conn *cp;
 
 	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->saddr.ip,
+		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
+				       &iph->saddr,
 				       htons(PORT_ISAKMP),
-				       iph->daddr.ip,
+				       &iph->daddr,
 				       htons(PORT_ISAKMP));
 	} else {
-		cp = ip_vs_conn_in_get(IPPROTO_UDP,
-				       iph->daddr.ip,
+		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
+				       &iph->daddr,
 				       htons(PORT_ISAKMP),
-				       iph->saddr.ip,
+				       &iph->saddr,
 				       htons(PORT_ISAKMP));
 	}
 
@@ -86,16 +86,16 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 	struct ip_vs_conn *cp;
 
 	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->saddr.ip,
+		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
+					&iph->saddr,
 					htons(PORT_ISAKMP),
-					iph->daddr.ip,
+					&iph->daddr,
 					htons(PORT_ISAKMP));
 	} else {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-					iph->daddr.ip,
+		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
+					&iph->daddr,
 					htons(PORT_ISAKMP),
-					iph->saddr.ip,
+					&iph->saddr,
 					htons(PORT_ISAKMP));
 	}
 
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 3daae43ae44..3da2bb05ee7 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -36,13 +36,13 @@ tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return NULL;
 
 	if (likely(!inverse)) {
-		return ip_vs_conn_in_get(iph->protocol,
-					 iph->saddr.ip, pptr[0],
-					 iph->daddr.ip, pptr[1]);
+		return ip_vs_conn_in_get(af, iph->protocol,
+					 &iph->saddr, pptr[0],
+					 &iph->daddr, pptr[1]);
 	} else {
-		return ip_vs_conn_in_get(iph->protocol,
-					 iph->daddr.ip, pptr[1],
-					 iph->saddr.ip, pptr[0]);
+		return ip_vs_conn_in_get(af, iph->protocol,
+					 &iph->daddr, pptr[1],
+					 &iph->saddr, pptr[0]);
 	}
 }
 
@@ -58,13 +58,13 @@ tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return NULL;
 
 	if (likely(!inverse)) {
-		return ip_vs_conn_out_get(iph->protocol,
-					  iph->saddr.ip, pptr[0],
-					  iph->daddr.ip, pptr[1]);
+		return ip_vs_conn_out_get(af, iph->protocol,
+					  &iph->saddr, pptr[0],
+					  &iph->daddr, pptr[1]);
 	} else {
-		return ip_vs_conn_out_get(iph->protocol,
-					  iph->daddr.ip, pptr[1],
-					  iph->saddr.ip, pptr[0]);
+		return ip_vs_conn_out_get(af, iph->protocol,
+					  &iph->daddr, pptr[1],
+					  &iph->saddr, pptr[0]);
 	}
 }
 
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 6cca0ad8e32..fd8bd934cc0 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -36,13 +36,13 @@ udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return NULL;
 
 	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(iph->protocol,
-				       iph->saddr.ip, pptr[0],
-				       iph->daddr.ip, pptr[1]);
+		cp = ip_vs_conn_in_get(af, iph->protocol,
+				       &iph->saddr, pptr[0],
+				       &iph->daddr, pptr[1]);
 	} else {
-		cp = ip_vs_conn_in_get(iph->protocol,
-				       iph->daddr.ip, pptr[1],
-				       iph->saddr.ip, pptr[0]);
+		cp = ip_vs_conn_in_get(af, iph->protocol,
+				       &iph->daddr, pptr[1],
+				       &iph->saddr, pptr[0]);
 	}
 
 	return cp;
@@ -62,13 +62,13 @@ udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return NULL;
 
 	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(iph->protocol,
-					iph->saddr.ip, pptr[0],
-					iph->daddr.ip, pptr[1]);
+		cp = ip_vs_conn_out_get(af, iph->protocol,
+					&iph->saddr, pptr[0],
+					&iph->daddr, pptr[1]);
 	} else {
-		cp = ip_vs_conn_out_get(iph->protocol,
-					iph->daddr.ip, pptr[1],
-					iph->saddr.ip, pptr[0]);
+		cp = ip_vs_conn_out_get(af, iph->protocol,
+					&iph->daddr, pptr[1],
+					&iph->saddr, pptr[0]);
 	}
 
 	return cp;
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2cf47b2e166..3ce1093e067 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -366,13 +366,17 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 		}
 
 		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(s->protocol,
-					       s->caddr, s->cport,
-					       s->vaddr, s->vport);
+			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
+					       (union nf_inet_addr *)&s->caddr,
+					       s->cport,
+					       (union nf_inet_addr *)&s->vaddr,
+					       s->vport);
 		else
-			cp = ip_vs_ct_in_get(s->protocol,
-					       s->caddr, s->cport,
-					       s->vaddr, s->vport);
+			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
+					     (union nf_inet_addr *)&s->caddr,
+					     s->cport,
+					     (union nf_inet_addr *)&s->vaddr,
+					     s->vport);
 		if (!cp) {
 			/*
 			 * Find the appropriate destination for the connection.
@@ -389,10 +393,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 				else
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
-			cp = ip_vs_conn_new(s->protocol,
-					    s->caddr, s->cport,
-					    s->vaddr, s->vport,
-					    s->daddr, s->dport,
+			cp = ip_vs_conn_new(AF_INET, s->protocol,
+					    (union nf_inet_addr *)s->caddr,
+					    s->cport,
+					    (union nf_inet_addr *)s->vaddr,
+					    s->vport,
+					    (union nf_inet_addr *)s->daddr,
+					    s->dport,
 					    flags, dest);
 			if (dest)
 				atomic_dec(&dest->refcnt);
-- 
cgit v1.2.3


From 38cdcc9a039b92a9972dca3c954fb3d8b3ef13bf Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:44 +0200
Subject: IPVS: Add IPv6 support to xmit() support functions

Add IPv6 support to IP_VS_XMIT() and to the xmit routing cache, introducing
a new function __ip_vs_get_out_rt_v6().

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_xmit.c | 82 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 75 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index 88199c9f2d3..fd8342ec7e1 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -20,6 +20,9 @@
 #include <net/udp.h>
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>                  /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 
@@ -47,7 +50,8 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
 
 	if (!dst)
 		return NULL;
-	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	if ((dst->obsolete
+	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
 	    dst->ops->check(dst, cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
@@ -109,6 +113,70 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 	return rt;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static struct rt6_info *
+__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	struct ip_vs_dest *dest = cp->dest;
+
+	if (dest) {
+		spin_lock(&dest->dst_lock);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		if (!rt) {
+			struct flowi fl = {
+				.oif = 0,
+				.nl_u = {
+					.ip6_u = {
+						.daddr = dest->addr.in6,
+						.saddr = {
+							.s6_addr32 =
+								{ 0, 0, 0, 0 },
+						},
+					},
+				},
+			};
+
+			rt = (struct rt6_info *)ip6_route_output(&init_net,
+								 NULL, &fl);
+			if (!rt) {
+				spin_unlock(&dest->dst_lock);
+				IP_VS_DBG_RL("ip6_route_output error, "
+					     "dest: " NIP6_FMT "\n",
+					     NIP6(dest->addr.in6));
+				return NULL;
+			}
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
+			IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
+				  NIP6(dest->addr.in6),
+				  atomic_read(&rt->u.dst.__refcnt));
+		}
+		spin_unlock(&dest->dst_lock);
+	} else {
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip6_u = {
+					.daddr = cp->daddr.in6,
+					.saddr = {
+						.s6_addr32 = { 0, 0, 0, 0 },
+					},
+				},
+			},
+		};
+
+		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+		if (!rt) {
+			IP_VS_DBG_RL("ip6_route_output error, dest: "
+				     NIP6_FMT "\n", NIP6(cp->daddr.in6));
+			return NULL;
+		}
+	}
+
+	return rt;
+}
+#endif
+
 
 /*
  *	Release dest->dst_cache before a dest is removed
@@ -123,11 +191,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 	dst_release(old_dst);
 }
 
-#define IP_VS_XMIT(skb, rt)				\
+#define IP_VS_XMIT(pf, skb, rt)				\
 do {							\
 	(skb)->ipvs_property = 1;			\
 	skb_forward_csum(skb);				\
-	NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL,	\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
 		(rt)->u.dst.dev, dst_output);		\
 } while (0)
 
@@ -200,7 +268,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(skb, rt);
+	IP_VS_XMIT(PF_INET, skb, rt);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -276,7 +344,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(skb, rt);
+	IP_VS_XMIT(PF_INET, skb, rt);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -467,7 +535,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(skb, rt);
+	IP_VS_XMIT(PF_INET, skb, rt);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -540,7 +608,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(skb, rt);
+	IP_VS_XMIT(PF_INET, skb, rt);
 
 	rc = NF_STOLEN;
 	goto out;
-- 
cgit v1.2.3


From b3cdd2a73867d309dca288b8e820c09e3b7f1da1 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:45 +0200
Subject: IPVS: Add and bind IPv6 xmit functions

Add xmit functions for IPv6. Also add the already needed __ip_vs_get_out_rt_v6()
to ip_vs_core.c. Bind the new xmit functions to v6 connections.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_conn.c |  34 +++-
 net/ipv4/ipvs/ip_vs_core.c |  43 ++++++
 net/ipv4/ipvs/ip_vs_xmit.c | 377 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 453 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 15eec282b17..f5dddad6d5e 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -389,6 +389,33 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
 	}
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit_v6;
+		break;
+	}
+}
+#endif
+
 
 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 {
@@ -694,7 +721,12 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 	cp->timeout = 3*HZ;
 
 	/* Bind its packet transmitter */
-	ip_vs_bind_xmit(cp);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_bind_xmit_v6(cp);
+	else
+#endif
+		ip_vs_bind_xmit(cp);
 
 	if (unlikely(pp && atomic_read(&pp->appcnt)))
 		ip_vs_bind_app(cp, pp);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 2d5a4331709..d6f5bf9049a 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -570,6 +570,49 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			"Forwarding altered incoming ICMP");
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    struct ip_vs_conn *cp, int inout)
+{
+	struct ipv6hdr *iph	 = ipv6_hdr(skb);
+	unsigned int icmp_offset = sizeof(struct ipv6hdr);
+	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) +
+						      icmp_offset);
+	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1);
+
+	if (inout) {
+		iph->saddr = cp->vaddr.in6;
+		ciph->daddr = cp->vaddr.in6;
+	} else {
+		iph->daddr = cp->daddr.in6;
+		ciph->saddr = cp->daddr.in6;
+	}
+
+	/* the TCP/UDP port */
+	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
+		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
+
+		if (inout)
+			ports[1] = cp->vport;
+		else
+			ports[0] = cp->dport;
+	}
+
+	/* And finally the ICMP checksum */
+	icmph->icmp6_cksum = 0;
+	/* TODO IPv6: is this correct for ICMPv6? */
+	ip_vs_checksum_complete(skb, icmp_offset);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (inout)
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered outgoing ICMPv6");
+	else
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered incoming ICMPv6");
+}
+#endif
+
 /*
  *	Handle ICMP messages in the inside-to-outside direction (outgoing).
  *	Find any that might be relevant, check against existing connections,
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index fd8342ec7e1..02ddc2b3ce2 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -281,6 +281,70 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	return NF_STOLEN;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	struct ipv6hdr  *iph = ipv6_hdr(skb);
+	int    mtu;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = iph->daddr,
+				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
+	};
+
+	EnterFunction(10);
+
+	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	if (!rt) {
+		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
+			     "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
+		goto tx_error_icmp;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		dst_release(&rt->u.dst);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->u.dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+ tx_error_icmp:
+	dst_link_failure(skb);
+ tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
 
 /*
  *      NAT transmitter (only for outside-to-inside nat forwarding)
@@ -360,6 +424,83 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	goto tx_error;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;		/* Route to the other host */
+	int mtu;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
+				       sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		dst_release(&rt->u.dst);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): frag needed for");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error;
+	ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+	return NF_STOLEN;
+tx_error_put:
+	dst_release(&rt->u.dst);
+	goto tx_error;
+}
+#endif
+
 
 /*
  *   IP Tunneling transmitter
@@ -491,6 +632,112 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	return NF_STOLEN;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;		/* Route to the other host */
+	struct net_device *tdev;	/* Device to other host */
+	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
+	sk_buff_data_t old_transport_header = skb->transport_header;
+	struct ipv6hdr  *iph;		/* Our new IP header */
+	unsigned int max_headroom;	/* The extra header space needed */
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (skb->protocol != htons(ETH_P_IPV6)) {
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
+			     "ETH_P_IPV6: %d, skb protocol: %d\n",
+			     htons(ETH_P_IPV6), skb->protocol);
+		goto tx_error;
+	}
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	tdev = rt->u.dst.dev;
+
+	mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
+	/* TODO IPv6: do we need this check in IPv6? */
+	if (mtu < 1280) {
+		dst_release(&rt->u.dst);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
+		goto tx_error;
+	}
+	if (skb->dst)
+		skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		dst_release(&rt->u.dst);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			dst_release(&rt->u.dst);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		old_iph = ipv6_hdr(skb);
+	}
+
+	skb->transport_header = old_transport_header;
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+	iph			=	ipv6_hdr(skb);
+	iph->version		=	6;
+	iph->nexthdr		=	IPPROTO_IPV6;
+	iph->payload_len	=	old_iph->payload_len + sizeof(old_iph);
+	iph->priority		=	old_iph->priority;
+	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
+	iph->daddr		=	rt->rt6i_dst.addr;
+	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	iph->hop_limit		=	old_iph->hop_limit;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	ip6_local_out(skb);
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
 
 /*
  *      Direct Routing transmitter
@@ -548,6 +795,60 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	return NF_STOLEN;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		 struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	int    mtu;
+
+	EnterFunction(10);
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		dst_release(&rt->u.dst);
+		IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->u.dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
 
 /*
  *	ICMP packet transmitter
@@ -625,3 +926,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	ip_rt_put(rt);
 	goto tx_error;
 }
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset)
+{
+	struct rt6_info	*rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		dst_release(&rt->u.dst);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop the old route when skb is not shared */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	rc = NF_STOLEN;
+	goto out;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+out:
+	LeaveFunction(10);
+	return rc;
+tx_error_put:
+	dst_release(&rt->u.dst);
+	goto tx_error;
+}
+#endif
-- 
cgit v1.2.3


From cd17f9ed099ed27e9b0d298253e5c05e335ac656 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:46 +0200
Subject: IPVS: Extend scheduling functions for IPv6 support

Convert ip_vs_schedule() and ip_vs_sched_persist() to support scheduling of
IPv6 connections.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 56 +++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index d6f5bf9049a..8bfd7c2f0eb 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -176,19 +176,25 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	__be16  dport;	 /* destination port to forward */
+	__be16  dport;			/* destination port to forward */
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
-	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
 	/* Mask saddr with the netmask to adjust template granularity */
-	snet.ip = iph.saddr.ip & svc->netmask;
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+	else
+#endif
+		snet.ip = iph.saddr.ip & svc->netmask;
 
-	IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
-		  "mnet %u.%u.%u.%u\n",
-		  NIPQUAD(iph.saddr.ip), ntohs(ports[0]),
-		  NIPQUAD(iph.daddr.ip), ntohs(ports[1]),
-		  NIPQUAD(snet));
+	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+		      "mnet %s\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+		      IP_VS_DBG_ADDR(svc->af, &snet));
 
 	/*
 	 * As far as we know, FTP is a very complicated network protocol, and
@@ -206,10 +212,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	if (ports[1] == svc->port) {
 		/* Check if a template already exists */
 		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0,
+			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 					     &iph.daddr, ports[1]);
 		else
-			ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0,
+			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 					     &iph.daddr, 0);
 
 		if (!ct || !ip_vs_check_template(ct)) {
@@ -230,7 +236,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 			 * for ftp service.
 			 */
 			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(AF_INET, iph.protocol,
+				ct = ip_vs_conn_new(svc->af, iph.protocol,
 						    &snet, 0,
 						    &iph.daddr,
 						    ports[1],
@@ -238,7 +244,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			else
-				ct = ip_vs_conn_new(AF_INET, iph.protocol,
+				ct = ip_vs_conn_new(svc->af, iph.protocol,
 						    &snet, 0,
 						    &iph.daddr, 0,
 						    &dest->addr, 0,
@@ -265,10 +271,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				.all = { 0, 0, 0, htonl(svc->fwmark) }
 			};
 
-			ct = ip_vs_ct_in_get(AF_INET, IPPROTO_IP, &snet, 0,
+			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
 					     &fwmark, 0);
 		} else
-			ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0,
+			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 					     &iph.daddr, 0);
 
 		if (!ct || !ip_vs_check_template(ct)) {
@@ -293,14 +299,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 					.all = { 0, 0, 0, htonl(svc->fwmark) }
 				};
 
-				ct = ip_vs_conn_new(AF_INET, IPPROTO_IP,
+				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
 						    &snet, 0,
 						    &fwmark, 0,
 						    &dest->addr, 0,
 						    IP_VS_CONN_F_TEMPLATE,
 						    dest);
 			} else
-				ct = ip_vs_conn_new(AF_INET, iph.protocol,
+				ct = ip_vs_conn_new(svc->af, iph.protocol,
 						    &snet, 0,
 						    &iph.daddr, 0,
 						    &dest->addr, 0,
@@ -320,7 +326,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/*
 	 *    Create a new connection according to the template
 	 */
-	cp = ip_vs_conn_new(AF_INET, iph.protocol,
+	cp = ip_vs_conn_new(svc->af, iph.protocol,
 			    &iph.saddr, ports[0],
 			    &iph.daddr, ports[1],
 			    &dest->addr, dport,
@@ -387,7 +393,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	/*
 	 *    Create a connection entry.
 	 */
-	cp = ip_vs_conn_new(AF_INET, iph.protocol,
+	cp = ip_vs_conn_new(svc->af, iph.protocol,
 			    &iph.saddr, pptr[0],
 			    &iph.daddr, pptr[1],
 			    &dest->addr, dest->port ? dest->port : pptr[1],
@@ -396,13 +402,13 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	if (cp == NULL)
 		return NULL;
 
-	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
-		  "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
-		  ip_vs_fwd_tag(cp),
-		  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
-		  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
-		  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
-		  cp->flags, atomic_read(&cp->refcnt));
+	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+		      ip_vs_fwd_tag(cp),
+		      IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
+		      cp->flags, atomic_read(&cp->refcnt));
 
 	ip_vs_conn_stats(cp, svc);
 	return cp;
-- 
cgit v1.2.3


From 2a3b791e6e1169f374224d164738e9f7be703d77 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:47 +0200
Subject: IPVS: Add/adjust Netfilter hook functions and helpers for v6

Add Netfilter hook functions or modify existing ones, if possible, to
process IPv6 packets. Some support functions are also added/modified for
this. ip_vs_nat_icmp_v6() was already added in the patch that added the v6
xmit functions, as it is called from one of them.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 365 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 329 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 8bfd7c2f0eb..035a511e12f 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -39,6 +39,11 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#endif
+
 #include <net/ip_vs.h>
 
 
@@ -60,6 +65,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
 
 /* ID used in ICMP lookups */
 #define icmp_id(icmph)          (((icmph)->un).echo.id)
+#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
 
 const char *ip_vs_proto_name(unsigned proto)
 {
@@ -74,6 +80,10 @@ const char *ip_vs_proto_name(unsigned proto)
 		return "TCP";
 	case IPPROTO_ICMP:
 		return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+	case IPPROTO_ICMPV6:
+		return "ICMPv6";
+#endif
 	default:
 		sprintf(buf, "IP_%d", proto);
 		return buf;
@@ -425,7 +435,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 {
 	__be16 _ports[2], *pptr;
 	struct ip_vs_iphdr iph;
-	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+	int unicast;
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 	if (pptr == NULL) {
@@ -433,11 +444,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		return NF_DROP;
 	}
 
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+	else
+#endif
+		unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+
 	/* if it is fwmark-based service, the cache_bypass sysctl is up
-	   and the destination is RTN_UNICAST (and not local), then create
+	   and the destination is a non-local unicast, then create
 	   a cache_bypass connection entry */
-	if (sysctl_ip_vs_cache_bypass && svc->fwmark
-	    && (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST)) {
+	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
 
@@ -445,7 +462,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 
 		/* create a new connection entry */
 		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
-		cp = ip_vs_conn_new(AF_INET, iph.protocol,
+		cp = ip_vs_conn_new(svc->af, iph.protocol,
 				    &iph.saddr, pptr[0],
 				    &iph.daddr, pptr[1],
 				    0, 0,
@@ -489,7 +506,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	 * created, the TCP RST packet cannot be sent, instead that
 	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 	 */
-	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
+			    skb->dev);
+	else
+#endif
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
 	return NF_DROP;
 }
 
@@ -528,6 +552,14 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 	return err;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
+{
+	/* TODO IPv6: Find out what to do here for IPv6 */
+	return 0;
+}
+#endif
+
 /*
  * Packet has been made sufficiently writable in caller
  * - inout: 1=in->out, 0=out->in
@@ -727,11 +759,117 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 	return verdict;
 }
 
-static inline int is_tcp_reset(const struct sk_buff *skb)
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+{
+	struct ipv6hdr *iph;
+	struct icmp6hdr	_icmph, *ic;
+	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
+					   within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, verdict;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+			return NF_STOLEN;
+	}
+
+	iph = ipv6_hdr(skb);
+	offset = sizeof(struct ipv6hdr);
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  NIP6(iph->saddr), NIP6(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->nexthdr);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	verdict = NF_DROP;
+
+	if (IP_VS_FWD_METHOD(cp) != 0) {
+		IP_VS_ERR("shouldn't reach here, because the box is on the "
+			  "half connection in the tun/dr module.\n");
+	}
+
+	/* Ensure the checksum is correct */
+	if (!skb_csum_unnecessary(skb)
+	    && ip_vs_checksum_complete(skb, sizeof(struct ipv6hdr))) {
+		/* Failed checksum! */
+		IP_VS_DBG(1, "Forward ICMPv6: failed checksum from "
+			  NIP6_FMT "!\n",
+			  NIP6(iph->saddr));
+		goto out;
+	}
+
+	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
+		offset += 2 * sizeof(__u16);
+	if (!skb_make_writable(skb, offset))
+		goto out;
+
+	ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+
+	/* do the statistics and put it back */
+	ip_vs_out_stats(cp, skb);
+
+	skb->ipvs_property = 1;
+	verdict = NF_ACCEPT;
+
+out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+#endif
+
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
 {
 	struct tcphdr _tcph, *th;
 
-	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
 	if (th == NULL)
 		return 0;
 	return th->rst;
@@ -750,38 +888,64 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
+	int af;
 
 	EnterFunction(11);
 
+	af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
-	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
-	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-		int related, verdict = ip_vs_out_icmp(skb, &related);
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 
-		if (related)
-			return verdict;
-		ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
-	}
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+			int related, verdict = ip_vs_out_icmp(skb, &related);
+
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
 
 	pp = ip_vs_proto_get(iph.protocol);
 	if (unlikely(!pp))
 		return NF_ACCEPT;
 
 	/* reassemble IP fragments */
-	if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
-		     !pp->dont_defrag)) {
-		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
-			return NF_STOLEN;
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 
-		ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
-	}
+			if (related)
+				return verdict;
+
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
+			     !pp->dont_defrag)) {
+			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+				return NF_STOLEN;
+
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
 
 	/*
 	 * Check if the packet belongs to an existing entry
 	 */
-	cp = pp->conn_out_get(AF_INET, skb, pp, &iph, iph.len, 0);
+	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
 	if (unlikely(!cp)) {
 		if (sysctl_ip_vs_nat_icmp_send &&
@@ -794,16 +958,26 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 			if (pptr == NULL)
 				return NF_ACCEPT;	/* Not for me */
 			if (ip_vs_lookup_real_service(iph.protocol,
-						      iph.saddr.ip, pptr[0])) {
+						      iph.saddr.ip,
+						      pptr[0])) {
 				/*
 				 * Notify the real server: there is no
 				 * existing entry if it is not RST
 				 * packet or not TCP packet.
 				 */
 				if (iph.protocol != IPPROTO_TCP
-				    || !is_tcp_reset(skb)) {
-					icmp_send(skb,ICMP_DEST_UNREACH,
-						  ICMP_PORT_UNREACH, 0);
+				    || !is_tcp_reset(skb, iph.len)) {
+#ifdef CONFIG_IP_VS_IPV6
+					if (af == AF_INET6)
+						icmpv6_send(skb,
+							    ICMPV6_DEST_UNREACH,
+							    ICMPV6_PORT_UNREACH,
+							    0, skb->dev);
+					else
+#endif
+						icmp_send(skb,
+							  ICMP_DEST_UNREACH,
+							  ICMP_PORT_UNREACH, 0);
 					return NF_DROP;
 				}
 			}
@@ -821,8 +995,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	/* mangle the packet */
 	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
 		goto drop;
-	ip_hdr(skb)->saddr = cp->vaddr.ip;
-	ip_send_check(ip_hdr(skb));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+	else
+#endif
+	{
+		ip_hdr(skb)->saddr = cp->vaddr.ip;
+		ip_send_check(ip_hdr(skb));
+	}
 
 	/* For policy routing, packets originating from this
 	 * machine itself may be routed differently to packets
@@ -830,8 +1012,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	 * if it came from this machine itself.  So re-compute
 	 * the routing information.
 	 */
-	if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-		goto drop;
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (ip6_route_me_harder(skb) != 0)
+			goto drop;
+	} else
+#endif
+		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto drop;
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 
@@ -949,6 +1137,94 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	return verdict;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static int
+ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+	struct ipv6hdr *iph;
+	struct icmp6hdr	_icmph, *ic;
+	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
+					   within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, verdict;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
+					       IP_DEFRAG_VS_IN :
+					       IP_DEFRAG_VS_FWD))
+			return NF_STOLEN;
+	}
+
+	iph = ipv6_hdr(skb);
+	offset = sizeof(struct ipv6hdr);
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  NIP6(iph->saddr), NIP6(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->nexthdr);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	verdict = NF_DROP;
+
+	/* do the statistics and put it back */
+	ip_vs_in_stats(cp, skb);
+	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
+		offset += 2 * sizeof(__u16);
+	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
+	/* do not touch skb anymore */
+
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+#endif
+
+
 /*
  *	Check if it's for virtual services, look it up,
  *	and send it on its way...
@@ -961,9 +1237,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int ret, restart;
+	int ret, restart, af;
+
+	af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 
-	ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 
 	/*
 	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
@@ -974,7 +1252,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
 			      skb->pkt_type,
 			      iph.protocol,
-			      IP_VS_DBG_ADDR(AF_INET, &iph.daddr));
+			      IP_VS_DBG_ADDR(af, &iph.daddr));
 		return NF_ACCEPT;
 	}
 
@@ -983,7 +1261,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 
 		if (related)
 			return verdict;
-		ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph);
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 	}
 
 	/* Protocol supported? */
@@ -994,12 +1272,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	/*
 	 * Check if the packet belongs to an existing connection entry
 	 */
-	cp = pp->conn_in_get(AF_INET, skb, pp, &iph, iph.len, 0);
+	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
 
 	if (unlikely(!cp)) {
 		int v;
 
-		if (!pp->conn_schedule(AF_INET, skb, pp, &v, &cp))
+		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
 			return v;
 	}
 
@@ -1082,6 +1360,21 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
 	return ip_vs_in_icmp(skb, &r, hooknum);
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static unsigned int
+ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in, const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	int r;
+
+	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp_v6(skb, &r, hooknum);
+}
+#endif
+
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
-- 
cgit v1.2.3


From 7937df1564783806c285d34a1c6fd63d8da29d7a Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:48 +0200
Subject: IPVS: Convert real server lookup functions

Convert functions for looking up destinations (real servers) to support
IPv6 services/dests.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_conn.c |  5 +--
 net/ipv4/ipvs/ip_vs_core.c |  4 +--
 net/ipv4/ipvs/ip_vs_ctl.c  | 80 ++++++++++++++++++++++++++++++----------------
 net/ipv4/ipvs/ip_vs_sync.c |  7 ++--
 4 files changed, 62 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index f5dddad6d5e..c2a42a62433 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -491,8 +491,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	struct ip_vs_dest *dest;
 
 	if ((cp) && (!cp->dest)) {
-		dest = ip_vs_find_dest(cp->daddr.ip, cp->dport,
-				       cp->vaddr.ip, cp->vport, cp->protocol);
+		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
+				       &cp->vaddr, cp->vport,
+				       cp->protocol);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
 	} else
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 035a511e12f..27bef1d67aa 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -957,8 +957,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 						  sizeof(_ports), _ports);
 			if (pptr == NULL)
 				return NF_ACCEPT;	/* Not for me */
-			if (ip_vs_lookup_real_service(iph.protocol,
-						      iph.saddr.ip,
+			if (ip_vs_lookup_real_service(af, iph.protocol,
+						      &iph.saddr,
 						      pptr[0])) {
 				/*
 				 * Notify the real server: there is no
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 1f3fc66e694..bb0e1e3c885 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -492,11 +492,20 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 /*
  *	Returns hash value for real service
  */
-static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
+static inline unsigned ip_vs_rs_hashkey(int af,
+					    const union nf_inet_addr *addr,
+					    __be16 port)
 {
 	register unsigned porth = ntohs(port);
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
 
-	return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
+	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
 		& IP_VS_RTAB_MASK;
 }
 
@@ -516,7 +525,8 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
 	 *	Hash by proto,addr,port,
 	 *	which are the parameters of the real service.
 	 */
-	hash = ip_vs_rs_hashkey(dest->addr.ip, dest->port);
+	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+
 	list_add(&dest->d_list, &ip_vs_rtable[hash]);
 
 	return 1;
@@ -543,7 +553,9 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
  *	Lookup real service by <proto,addr,port> in the real service table.
  */
 struct ip_vs_dest *
-ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
+ip_vs_lookup_real_service(int af, __u16 protocol,
+			  const union nf_inet_addr *daddr,
+			  __be16 dport)
 {
 	unsigned hash;
 	struct ip_vs_dest *dest;
@@ -552,11 +564,12 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
 	 *	Check for "full" addressed entries
 	 *	Return the first found entry
 	 */
-	hash = ip_vs_rs_hashkey(daddr, dport);
+	hash = ip_vs_rs_hashkey(af, daddr, dport);
 
 	read_lock(&__ip_vs_rs_lock);
 	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
-		if ((dest->addr.ip == daddr)
+		if ((dest->af == af)
+		    && ip_vs_addr_equal(af, &dest->addr, daddr)
 		    && (dest->port == dport)
 		    && ((dest->protocol == protocol) ||
 			dest->vfwmark)) {
@@ -574,7 +587,8 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
  *	Lookup destination by {addr,port} in the given service
  */
 static struct ip_vs_dest *
-ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
+ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		  __be16 dport)
 {
 	struct ip_vs_dest *dest;
 
@@ -582,7 +596,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
 	 * Find the destination for the given service
 	 */
 	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if ((dest->addr.ip == daddr) && (dest->port == dport)) {
+		if ((dest->af == svc->af)
+		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
+		    && (dest->port == dport)) {
 			/* HIT */
 			return dest;
 		}
@@ -601,14 +617,15 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
  * ip_vs_lookup_real_service() looked promissing, but
  * seems not working as expected.
  */
-struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
-				    __be32 vaddr, __be16 vport, __u16 protocol)
+struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+				   __be16 dport,
+				   const union nf_inet_addr *vaddr,
+				   __be16 vport, __u16 protocol)
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
-	union nf_inet_addr _vaddr = { .ip = vaddr };
 
-	svc = ip_vs_service_get(AF_INET, 0, protocol, &_vaddr, vport);
+	svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
 	dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -629,7 +646,8 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
  *  scheduling.
  */
 static struct ip_vs_dest *
-ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
+ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		     __be16 dport)
 {
 	struct ip_vs_dest *dest, *nxt;
 
@@ -637,17 +655,19 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
 	 * Find the destination in trash
 	 */
 	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
-		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
-			  "dest->refcnt=%d\n",
-			  dest->vfwmark,
-			  NIPQUAD(dest->addr.ip), ntohs(dest->port),
-			  atomic_read(&dest->refcnt));
-		if (dest->addr.ip == daddr &&
+		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+			      "dest->refcnt=%d\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		if (dest->af == svc->af &&
+		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
 		    dest->port == dport &&
 		    dest->vfwmark == svc->fwmark &&
 		    dest->protocol == svc->protocol &&
 		    (svc->fwmark ||
-		     (dest->vaddr.ip == svc->addr.ip &&
+		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
 		      dest->vport == svc->port))) {
 			/* HIT */
 			return dest;
@@ -657,10 +677,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
 		 * Try to purge the destination from trash if not referenced
 		 */
 		if (atomic_read(&dest->refcnt) == 1) {
-			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
-				  "from trash\n",
-				  dest->vfwmark,
-				  NIPQUAD(dest->addr.ip), ntohs(dest->port));
+			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
+				      "from trash\n",
+				      dest->vfwmark,
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port));
 			list_del(&dest->n_list);
 			ip_vs_dst_reset(dest);
 			__ip_vs_unbind_svc(dest);
@@ -847,7 +868,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 * Check if the dest already exists in the list
 	 */
-	dest = ip_vs_lookup_dest(svc, daddr.ip, dport);
+	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+
 	if (dest != NULL) {
 		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
 		return -EEXIST;
@@ -857,7 +879,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	 * Check if the dest already exists in the trash and
 	 * is from the same service
 	 */
-	dest = ip_vs_trash_get_dest(svc, daddr.ip, dport);
+	dest = ip_vs_trash_get_dest(svc, &daddr, dport);
+
 	if (dest != NULL) {
 		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
 			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
@@ -956,7 +979,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 *  Lookup the destination list
 	 */
-	dest = ip_vs_lookup_dest(svc, daddr.ip, dport);
+	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+
 	if (dest == NULL) {
 		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
 		return -ENOENT;
@@ -1054,7 +1078,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
 	EnterFunction(2);
 
-	dest = ip_vs_lookup_dest(svc, udest->addr.ip, dport);
+	dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
 
 	if (dest == NULL) {
 		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 3ce1093e067..40647edf102 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -383,8 +383,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 			 * If it is not found the connection will remain unbound
 			 * but still handled.
 			 */
-			dest = ip_vs_find_dest(s->daddr, s->dport,
-					       s->vaddr, s->vport,
+			dest = ip_vs_find_dest(AF_INET,
+					       (union nf_inet_addr *)&s->daddr,
+					       s->dport,
+					       (union nf_inet_addr *)&s->vaddr,
+					       s->vport,
 					       s->protocol);
 			/*  Set the approprite ativity flag */
 			if (s->protocol == IPPROTO_TCP) {
-- 
cgit v1.2.3


From 667a5f18162e803e30722af46ade1737e3b93198 Mon Sep 17 00:00:00 2001
From: Vince Busam <vbusam@google.com>
Date: Tue, 2 Sep 2008 15:55:49 +0200
Subject: IPVS: Convert procfs files for IPv6 entry output

Correctly output IPv6 connection/service/dest entries in procfs files.

Signed-off-by: Vince Busam <vbusam@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_conn.c | 37 +++++++++++++++++++++++++++----
 net/ipv4/ipvs/ip_vs_ctl.c  | 54 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 73 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index c2a42a62433..e7603d749c0 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -815,8 +815,22 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 	else {
 		const struct ip_vs_conn *cp = v;
 
-		seq_printf(seq,
-			"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			seq_printf(seq,
+				"%-3s " NIP6_FMT " %04X " NIP6_FMT
+				" %04X " NIP6_FMT " %04X %-11s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				NIP6(cp->caddr.in6), ntohs(cp->cport),
+				NIP6(cp->vaddr.in6), ntohs(cp->vport),
+				NIP6(cp->daddr.in6), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				(cp->timer.expires-jiffies)/HZ);
+		else
+#endif
+			seq_printf(seq,
+				"%-3s %08X %04X %08X %04X"
+				" %08X %04X %-11s %7lu\n",
 				ip_vs_proto_name(cp->protocol),
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
@@ -864,8 +878,23 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
 	else {
 		const struct ip_vs_conn *cp = v;
 
-		seq_printf(seq,
-			"%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n",
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			seq_printf(seq,
+				"%-3s " NIP6_FMT " %04X " NIP6_FMT
+				" %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				NIP6(cp->caddr.in6), ntohs(cp->cport),
+				NIP6(cp->vaddr.in6), ntohs(cp->vport),
+				NIP6(cp->daddr.in6), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_origin_name(cp->flags),
+				(cp->timer.expires-jiffies)/HZ);
+		else
+#endif
+			seq_printf(seq,
+				"%-3s %08X %04X %08X %04X "
+				"%08X %04X %-11s %-6s %7lu\n",
 				ip_vs_proto_name(cp->protocol),
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index bb0e1e3c885..25d9e98e31f 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1793,15 +1793,25 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		const struct ip_vs_iter *iter = seq->private;
 		const struct ip_vs_dest *dest;
 
-		if (iter->table == ip_vs_svc_table)
-			seq_printf(seq, "%s  %08X:%04X %s ",
-				   ip_vs_proto_name(svc->protocol),
-				   ntohl(svc->addr.ip),
-				   ntohs(svc->port),
-				   svc->scheduler->name);
-		else
+		if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+			if (svc->af == AF_INET6)
+				seq_printf(seq, "%s  [" NIP6_FMT "]:%04X %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   NIP6(svc->addr.in6),
+					   ntohs(svc->port),
+					   svc->scheduler->name);
+			else
+#endif
+				seq_printf(seq, "%s  %08X:%04X %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   ntohl(svc->addr.ip),
+					   ntohs(svc->port),
+					   svc->scheduler->name);
+		} else {
 			seq_printf(seq, "FWM  %08X %s ",
 				   svc->fwmark, svc->scheduler->name);
+		}
 
 		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 			seq_printf(seq, "persistent %d %08X\n",
@@ -1811,13 +1821,29 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 			seq_putc(seq, '\n');
 
 		list_for_each_entry(dest, &svc->destinations, n_list) {
-			seq_printf(seq,
-				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
-				   ntohl(dest->addr.ip), ntohs(dest->port),
-				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
-				   atomic_read(&dest->weight),
-				   atomic_read(&dest->activeconns),
-				   atomic_read(&dest->inactconns));
+#ifdef CONFIG_IP_VS_IPV6
+			if (dest->af == AF_INET6)
+				seq_printf(seq,
+					   "  -> [" NIP6_FMT "]:%04X"
+					   "      %-7s %-6d %-10d %-10d\n",
+					   NIP6(dest->addr.in6),
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+			else
+#endif
+				seq_printf(seq,
+					   "  -> %08X:%04X      "
+					   "%-7s %-6d %-10d %-10d\n",
+					   ntohl(dest->addr.ip),
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+
 		}
 	}
 	return 0;
-- 
cgit v1.2.3


From c6883f587341a3ed113856de8769d0992b4bbd85 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:50 +0200
Subject: IVPS: Disable sync daemon for IPv6 connections

Disable the sync daemon for IPv6 connections, works only with IPv4 for now.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 27bef1d67aa..5a7a81778b0 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1321,7 +1321,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	 * encorage the standby servers to update the connections timeout
 	 */
 	atomic_inc(&cp->in_pkts);
-	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	if (af == AF_INET &&
+	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
 	      (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
-- 
cgit v1.2.3


From a0eb662f9ec8962928d937a185ad128db12c4637 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:51 +0200
Subject: IPVS: Turn off FTP application helper for IPv6

Immediately return from FTP application helper and do nothing when dealing
with IPv6 packets. IPv6 is not supported by this helper yet.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ftp.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
index 0c3fbe0de5f..2e7dbd8b73a 100644
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -147,6 +147,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	unsigned buf_len;
 	int ret;
 
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
 	*diff = 0;
 
 	/* Only useful for established sessions */
@@ -248,6 +256,14 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	__be16 port;
 	struct ip_vs_conn *n_cp;
 
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
 	/* no diff required for incoming packets */
 	*diff = 0;
 
-- 
cgit v1.2.3


From 09571c7ae30865adfa79dccd12a822a65d2c4b5a Mon Sep 17 00:00:00 2001
From: Vince Busam <vbusam@google.com>
Date: Tue, 2 Sep 2008 15:55:52 +0200
Subject: IPVS: Add function to determine if IPv6 address is local

Add __ip_vs_addr_is_local_v6() to find out if an IPv6 address belongs to a
local interface. Use this function to decide whether to set the
IP_VS_CONN_F_LOCALNODE flag for IPv6 destinations.

Signed-off-by: Vince Busam <vbusam@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 56 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 25d9e98e31f..640203a153c 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,10 @@
 
 #include <net/net_namespace.h>
 #include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
 #include <net/route.h>
 #include <net/sock.h>
 #include <net/genetlink.h>
@@ -91,6 +95,26 @@ int ip_vs_get_debug_level(void)
 }
 #endif
 
+#ifdef CONFIG_IP_VS_IPV6
+/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
+static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+{
+	struct rt6_info *rt;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *addr,
+				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
+	};
+
+	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
+			return 1;
+
+	return 0;
+}
+#endif
 /*
  *	update_defense_level is called from keventd and from sysctl,
  *	so it needs to protect itself from softirqs
@@ -751,10 +775,18 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
 
 	/* check if local node and update the flags */
-	if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-			| IP_VS_CONN_F_LOCALNODE;
-	}
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6) {
+		if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
+			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+				| IP_VS_CONN_F_LOCALNODE;
+		}
+	} else
+#endif
+		if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
+			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+				| IP_VS_CONN_F_LOCALNODE;
+		}
 
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
 	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
@@ -803,9 +835,19 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 
 	EnterFunction(2);
 
-	atype = inet_addr_type(&init_net, udest->addr.ip);
-	if (atype != RTN_LOCAL && atype != RTN_UNICAST)
-		return -EINVAL;
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6) {
+		atype = ipv6_addr_type(&udest->addr.in6);
+		if (!(atype & IPV6_ADDR_UNICAST) &&
+			!__ip_vs_addr_is_local_v6(&udest->addr.in6))
+			return -EINVAL;
+	} else
+#endif
+	{
+		atype = inet_addr_type(&init_net, udest->addr.ip);
+		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+			return -EINVAL;
+	}
 
 	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
 	if (dest == NULL) {
-- 
cgit v1.2.3


From cfc78c5a09241a3a9561466834996a7fb90c4228 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:53 +0200
Subject: IPVS: Adjust various debug outputs to use new macros

Adjust various debug outputs to use the new *_BUF macro variants for
correct output of v4/v6 addresses.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_conn.c      | 57 ++++++++++++++++++++++-------------------
 net/ipv4/ipvs/ip_vs_ctl.c       | 24 +++++++++--------
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 45 ++++++++++++++++++--------------
 net/ipv4/ipvs/ip_vs_proto_udp.c | 15 ++++++-----
 4 files changed, 78 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index e7603d749c0..9a24332fbed 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -449,16 +449,16 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 		cp->flags |= atomic_read(&dest->conn_flags);
 	cp->dest = dest;
 
-	IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
-		  "dest->refcnt:%d\n",
-		  ip_vs_proto_name(cp->protocol),
-		  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
-		  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
-		  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
-		  ip_vs_fwd_tag(cp), cp->state,
-		  cp->flags, atomic_read(&cp->refcnt),
-		  atomic_read(&dest->refcnt));
+	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
 
 	/* Update the connection counters */
 	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
@@ -512,16 +512,16 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 	if (!dest)
 		return;
 
-	IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
-		  "dest->refcnt:%d\n",
-		  ip_vs_proto_name(cp->protocol),
-		  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
-		  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
-		  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
-		  ip_vs_fwd_tag(cp), cp->state,
-		  cp->flags, atomic_read(&cp->refcnt),
-		  atomic_read(&dest->refcnt));
+	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
 
 	/* Update the connection counters */
 	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
@@ -574,13 +574,16 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
 	    (sysctl_ip_vs_expire_quiescent_template &&
 	     (atomic_read(&dest->weight) == 0))) {
-		IP_VS_DBG(9, "check_template: dest not available for "
-			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-			  "-> d:%u.%u.%u.%u:%d\n",
-			  ip_vs_proto_name(ct->protocol),
-			  NIPQUAD(ct->caddr.ip), ntohs(ct->cport),
-			  NIPQUAD(ct->vaddr.ip), ntohs(ct->vport),
-			  NIPQUAD(ct->daddr.ip), ntohs(ct->dport));
+		IP_VS_DBG_BUF(9, "check_template: dest not available for "
+			      "protocol %s s:%s:%d v:%s:%d "
+			      "-> d:%s:%d\n",
+			      ip_vs_proto_name(ct->protocol),
+			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+			      ntohs(ct->cport),
+			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+			      ntohs(ct->vport),
+			      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
+			      ntohs(ct->dport));
 
 		/*
 		 * Invalidate the connection template
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 640203a153c..6dbc527285f 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -924,13 +924,14 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	dest = ip_vs_trash_get_dest(svc, &daddr, dport);
 
 	if (dest != NULL) {
-		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
-			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
-			  NIPQUAD(daddr), ntohs(dport),
-			  atomic_read(&dest->refcnt),
-			  dest->vfwmark,
-			  NIPQUAD(dest->vaddr.ip),
-			  ntohs(dest->vport));
+		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+			      "dest->refcnt=%d, service %u/%s:%u\n",
+			      IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
+			      atomic_read(&dest->refcnt),
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+			      ntohs(dest->vport));
+
 		__ip_vs_update_dest(svc, dest, udest);
 
 		/*
@@ -1076,10 +1077,11 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 		atomic_dec(&dest->svc->refcnt);
 		kfree(dest);
 	} else {
-		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
-			  "dest->refcnt=%d\n",
-			  NIPQUAD(dest->addr.ip), ntohs(dest->port),
-			  atomic_read(&dest->refcnt));
+		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
+			      "dest->refcnt=%d\n",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
 		list_add(&dest->n_list, &ip_vs_dest_trash);
 		atomic_inc(&dest->refcnt);
 	}
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 3da2bb05ee7..de8ed73997c 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -490,19 +490,23 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	if (new_state != cp->state) {
 		struct ip_vs_dest *dest = cp->dest;
 
-		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
-			  "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
-			  pp->name,
-			  (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
-			  th->syn? 'S' : '.',
-			  th->fin? 'F' : '.',
-			  th->ack? 'A' : '.',
-			  th->rst? 'R' : '.',
-			  NIPQUAD(cp->daddr.ip), ntohs(cp->dport),
-			  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
-			  tcp_state_name(cp->state),
-			  tcp_state_name(new_state),
-			  atomic_read(&cp->refcnt));
+		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+			      "%s:%d state: %s->%s conn->refcnt:%d\n",
+			      pp->name,
+			      ((state_off == TCP_DIR_OUTPUT) ?
+			       "output " : "input "),
+			      th->syn ? 'S' : '.',
+			      th->fin ? 'F' : '.',
+			      th->ack ? 'A' : '.',
+			      th->rst ? 'R' : '.',
+			      IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+			      ntohs(cp->dport),
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+			      ntohs(cp->cport),
+			      tcp_state_name(cp->state),
+			      tcp_state_name(new_state),
+			      atomic_read(&cp->refcnt));
+
 		if (dest) {
 			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
 			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
@@ -623,12 +627,15 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 				break;
 			spin_unlock(&tcp_app_lock);
 
-			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
-				  "%u.%u.%u.%u:%u to app %s on port %u\n",
-				  __func__,
-				  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
-				  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
-				  inc->name, ntohs(inc->port));
+			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+				      "%s:%u to app %s on port %u\n",
+				      __func__,
+				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				      ntohs(cp->cport),
+				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+				      ntohs(cp->vport),
+				      inc->name, ntohs(inc->port));
+
 			cp->app = inc;
 			if (inc->init_conn)
 				result = inc->init_conn(inc, cp);
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index fd8bd934cc0..5f2073e41cf 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -408,12 +408,15 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
 				break;
 			spin_unlock(&udp_app_lock);
 
-			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
-				  "%u.%u.%u.%u:%u to app %s on port %u\n",
-				  __func__,
-				  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
-				  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
-				  inc->name, ntohs(inc->port));
+			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+				      "%s:%u to app %s on port %u\n",
+				      __func__,
+				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				      ntohs(cp->cport),
+				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+				      ntohs(cp->vport),
+				      inc->name, ntohs(inc->port));
+
 			cp->app = inc;
 			if (inc->init_conn)
 				result = inc->init_conn(inc, cp);
-- 
cgit v1.2.3


From 473b23d37b697c66ac0bfcfdcc9badf718e25d2a Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:54 +0200
Subject: IPVS: Activate IPv6 Netfilter hooks

Register the previously defined or adapted netfilter hook functions for
IPv6 as PF_INET6 hooks.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5a7a81778b0..7d3de9db5ac 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1413,6 +1413,43 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_POST_ROUTING,
 		.priority       = NF_IP_PRI_NAT_SRC-1,
 	},
+#ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_in,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_LOCAL_IN,
+		.priority       = 100,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_out,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_FORWARD,
+		.priority       = 100,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp_v6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_FORWARD,
+		.priority       = 99,
+	},
+	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	{
+		.hook		= ip_vs_post_routing,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_POST_ROUTING,
+		.priority       = NF_IP6_PRI_NAT_SRC-1,
+	},
+#endif
 };
 
 
-- 
cgit v1.2.3


From f94fd041402e4e70d2b4ed00008b9bb857e6ae87 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Tue, 2 Sep 2008 15:55:55 +0200
Subject: IPVS: Allow adding IPv6 services from userspace

Allow adding IPv6 services through the genetlink interface and add checks
to see if the chosen scheduler is supported with IPv6 and whether the
supplied prefix length is sane. Make sure the service count exported via
the sockopt interface only counts IPv4 services.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 53 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 6dbc527285f..7f89c588e58 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1177,6 +1177,19 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		goto out_mod_dec;
 	}
 
+#ifdef CONFIG_IP_VS_IPV6
+	if (u->af == AF_INET6) {
+		if (!sched->supports_ipv6) {
+			ret = -EAFNOSUPPORT;
+			goto out_err;
+		}
+		if ((u->netmask < 1) || (u->netmask > 128)) {
+			ret = -EINVAL;
+			goto out_err;
+		}
+	}
+#endif
+
 	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
 	if (svc == NULL) {
 		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
@@ -1214,7 +1227,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		atomic_inc(&ip_vs_nullsvc_counter);
 
 	ip_vs_new_estimator(&svc->stats);
-	ip_vs_num_services++;
+
+	/* Count only IPv4 services for old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ip_vs_num_services++;
 
 	/* Hash the service into the service table */
 	write_lock_bh(&__ip_vs_svc_lock);
@@ -1265,6 +1281,19 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	}
 	old_sched = sched;
 
+#ifdef CONFIG_IP_VS_IPV6
+	if (u->af == AF_INET6) {
+		if (!sched->supports_ipv6) {
+			ret = EAFNOSUPPORT;
+			goto out;
+		}
+		if ((u->netmask < 1) || (u->netmask > 128)) {
+			ret = EINVAL;
+			goto out;
+		}
+	}
+#endif
+
 	write_lock_bh(&__ip_vs_svc_lock);
 
 	/*
@@ -1329,7 +1358,10 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
 
-	ip_vs_num_services--;
+	/* Count only IPv4 services for old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ip_vs_num_services--;
+
 	ip_vs_kill_estimator(&svc->stats);
 
 	/* Unbind scheduler */
@@ -2212,6 +2244,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (svc->af != AF_INET)
+				continue;
+
 			if (count >= get->num_services)
 				goto out;
 			memset(&entry, 0, sizeof(entry));
@@ -2227,6 +2263,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (svc->af != AF_INET)
+				continue;
+
 			if (count >= get->num_services)
 				goto out;
 			memset(&entry, 0, sizeof(entry));
@@ -2584,7 +2624,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	if (!nl_service)
 		return -EMSGSIZE;
 
-	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET);
+	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
 
 	if (svc->fwmark) {
 		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
@@ -2691,8 +2731,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		return -EINVAL;
 
 	usvc->af = nla_get_u16(nla_af);
-	/* For now, only support IPv4 */
-	if (nla_get_u16(nla_af) != AF_INET)
+#ifdef CONFIG_IP_VS_IPV6
+	if (usvc->af != AF_INET && usvc->af != AF_INET6)
+#else
+	if (usvc->af != AF_INET)
+#endif
 		return -EAFNOSUPPORT;
 
 	if (nla_fwmark) {
-- 
cgit v1.2.3


From 4856c84c1358b79852743ac64e50c1e9d5118f05 Mon Sep 17 00:00:00 2001
From: Malcolm Turnbull <malcolm@loadbalancer.org>
Date: Fri, 5 Sep 2008 11:17:13 +1000
Subject: ipvs: load balance IPv4 connections from a local process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows IPVS to load balance connections made by a local process.
For example a proxy server running locally.

External client --> pound:443 -> Local:443 --> IPVS:80 --> RealServer

Signed-off-by: Siim Põder <siim@p6drad-teel.net>
Signed-off-by: Malcolm Turnbull <malcolm@loadbalancer.org>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c      | 224 +++++++++++++++++++++++-----------------
 net/ipv4/ipvs/ip_vs_proto_tcp.c |   4 +-
 2 files changed, 134 insertions(+), 94 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 7d3de9db5ac..26e3d99bbee 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -651,12 +651,53 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 }
 #endif
 
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host. Used for NAT and local client.
+ */
+static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph,
+				struct iphdr *cih, struct ip_vs_conn *cp,
+				struct ip_vs_protocol *pp,
+				unsigned int offset, unsigned int ihl)
+{
+	unsigned int verdict = NF_DROP;
+
+	if (IP_VS_FWD_METHOD(cp) != 0) {
+		IP_VS_ERR("shouldn't reach here, because the box is on the "
+			  "half connection in the tun/dr module.\n");
+	}
+
+	/* Ensure the checksum is correct */
+	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG(1,
+			  "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
+			  NIPQUAD(iph->saddr));
+		goto out;
+	}
+
+	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+		offset += 2 * sizeof(__u16);
+	if (!skb_make_writable(skb, offset))
+		goto out;
+
+	ip_vs_nat_icmp(skb, pp, cp, 1);
+
+	/* do the statistics and put it back */
+	ip_vs_out_stats(cp, skb);
+
+	skb->ipvs_property = 1;
+	verdict = NF_ACCEPT;
+
+out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
 /*
  *	Handle ICMP messages in the inside-to-outside direction (outgoing).
- *	Find any that might be relevant, check against existing connections,
- *	forward to the right destination host if relevant.
+ *	Find any that might be relevant, check against existing connections.
  *	Currently handles error types - unreachable, quench, ttl exceeded.
- *	(Only used in VS/NAT)
  */
 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 {
@@ -666,7 +707,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 	struct ip_vs_iphdr ciph;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
-	unsigned int offset, ihl, verdict;
+	unsigned int offset, ihl;
 
 	*related = 1;
 
@@ -725,38 +766,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 	if (!cp)
 		return NF_ACCEPT;
 
-	verdict = NF_DROP;
-
-	if (IP_VS_FWD_METHOD(cp) != 0) {
-		IP_VS_ERR("shouldn't reach here, because the box is on the "
-			  "half connection in the tun/dr module.\n");
-	}
-
-	/* Ensure the checksum is correct */
-	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
-		/* Failed checksum! */
-		IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
-			  NIPQUAD(iph->saddr));
-		goto out;
-	}
-
-	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
-		offset += 2 * sizeof(__u16);
-	if (!skb_make_writable(skb, offset))
-		goto out;
-
-	ip_vs_nat_icmp(skb, pp, cp, 1);
-
-	/* do the statistics and put it back */
-	ip_vs_out_stats(cp, skb);
-
-	skb->ipvs_property = 1;
-	verdict = NF_ACCEPT;
-
-  out:
-	__ip_vs_conn_put(cp);
-
-	return verdict;
+	return handle_response_icmp(skb, iph, cih, cp, pp, offset, ihl);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -875,10 +885,76 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
 	return th->rst;
 }
 
+/* Handle response packets: rewrite addresses and send away...
+ * Used for NAT and local client.
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+		struct ip_vs_conn *cp, int ihl)
+{
+	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+
+	if (!skb_make_writable(skb, ihl))
+		goto drop;
+
+	/* mangle the packet */
+	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+		goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+	else
+#endif
+	{
+		ip_hdr(skb)->saddr = cp->vaddr.ip;
+		ip_send_check(ip_hdr(skb));
+	}
+
+	/* For policy routing, packets originating from this
+	 * machine itself may be routed differently to packets
+	 * passing through.  We want this packet to be routed as
+	 * if it came from this machine itself.  So re-compute
+	 * the routing information.
+	 */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (ip6_route_me_harder(skb) != 0)
+			goto drop;
+	} else
+#endif
+		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto drop;
+
+	/* For policy routing, packets originating from this
+	 * machine itself may be routed differently to packets
+	 * passing through.  We want this packet to be routed as
+	 * if it came from this machine itself.  So re-compute
+	 * the routing information.
+	 */
+	if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+		goto drop;
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+
+	ip_vs_out_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+	ip_vs_conn_put(cp);
+
+	skb->ipvs_property = 1;
+
+	LeaveFunction(11);
+	return NF_ACCEPT;
+
+drop:
+	ip_vs_conn_put(cp);
+	kfree_skb(skb);
+	return NF_STOLEN;
+}
+
 /*
  *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
- *	Check if outgoing packet belongs to the established ip_vs_conn,
- *      rewrite addresses of the packet and send it on its way...
+ *	Check if outgoing packet belongs to the established ip_vs_conn.
  */
 static unsigned int
 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
@@ -987,55 +1063,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
-
-	if (!skb_make_writable(skb, iph.len))
-		goto drop;
-
-	/* mangle the packet */
-	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
-		goto drop;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
-	else
-#endif
-	{
-		ip_hdr(skb)->saddr = cp->vaddr.ip;
-		ip_send_check(ip_hdr(skb));
-	}
-
-	/* For policy routing, packets originating from this
-	 * machine itself may be routed differently to packets
-	 * passing through.  We want this packet to be routed as
-	 * if it came from this machine itself.  So re-compute
-	 * the routing information.
-	 */
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6) {
-		if (ip6_route_me_harder(skb) != 0)
-			goto drop;
-	} else
-#endif
-		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-			goto drop;
-
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
-
-	ip_vs_out_stats(cp, skb);
-	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-	ip_vs_conn_put(cp);
-
-	skb->ipvs_property = 1;
-
-	LeaveFunction(11);
-	return NF_ACCEPT;
-
-  drop:
-	ip_vs_conn_put(cp);
-	kfree_skb(skb);
-	return NF_STOLEN;
+	return handle_response(af, skb, pp, cp, iph.len);
 }
 
 
@@ -1111,8 +1139,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
 	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
-	if (!cp)
+	if (!cp) {
+		/* The packet could also belong to a local client */
+		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+		if (cp)
+			return handle_response_icmp(skb, iph, cih, cp, pp,
+						    offset, ihl);
 		return NF_ACCEPT;
+	}
 
 	verdict = NF_DROP;
 
@@ -1244,11 +1278,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 
 	/*
-	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
-	 *	... don't know why 1st test DOES NOT include 2nd (?)
+	 *	Big tappo: only PACKET_HOST, including loopback for local client
+	 *	Don't handle local packets on IPv6 for now
 	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST
-		     || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
+	if (unlikely(skb->pkt_type != PACKET_HOST ||
+		     (af == AF_INET6 || (skb->dev->flags & IFF_LOOPBACK ||
+					 skb->sk)))) {
 		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
 			      skb->pkt_type,
 			      iph.protocol,
@@ -1277,6 +1312,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(!cp)) {
 		int v;
 
+		/* For local client packets, it could be a response */
+		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+		if (cp)
+			return handle_response(af, skb, pp, cp, iph.len);
+
 		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
 			return v;
 	}
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index de8ed73997c..808e8be0280 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -166,7 +166,7 @@ tcp_snat_handler(struct sk_buff *skb,
 	tcph->source = cp->vport;
 
 	/* Adjust TCP checksums */
-	if (!cp->app) {
+	if (!cp->app && (tcph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
@@ -235,7 +235,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	/*
 	 *	Adjust TCP checksums
 	 */
-	if (!cp->app) {
+	if (!cp->app && (tcph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
-- 
cgit v1.2.3


From f2428ed5e7bc89c7716ead22748cb5d076e204f0 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Fri, 5 Sep 2008 11:17:14 +1000
Subject: ipvs: load balance ipv6 connections from a local process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows IPVS to load balance IPv6 connections made by a local process.
For example a proxy server running locally.

External client --> pound:443 -> Local:443 --> IPVS:80 --> RealServer

This is an extenstion to the IPv4 work done in this area
by Siim Põder and Malcolm Turnbull.

Cc: Siim Põder <siim@p6drad-teel.net>
Cc: Malcolm Turnbull <malcolm@loadbalancer.org>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 91 +++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 26e3d99bbee..05797a5ce15 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -654,8 +654,9 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 /* Handle relevant response ICMP messages - forward to the right
  * destination host. Used for NAT and local client.
  */
-static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph,
-				struct iphdr *cih, struct ip_vs_conn *cp,
+static int handle_response_icmp(int af, struct sk_buff *skb,
+				union nf_inet_addr *snet,
+				__u8 protocol, struct ip_vs_conn *cp,
 				struct ip_vs_protocol *pp,
 				unsigned int offset, unsigned int ihl)
 {
@@ -669,18 +670,22 @@ static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph,
 	/* Ensure the checksum is correct */
 	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 		/* Failed checksum! */
-		IP_VS_DBG(1,
-			  "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
-			  NIPQUAD(iph->saddr));
+		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+			      IP_VS_DBG_ADDR(af, snet));
 		goto out;
 	}
 
-	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
 		offset += 2 * sizeof(__u16);
 	if (!skb_make_writable(skb, offset))
 		goto out;
 
-	ip_vs_nat_icmp(skb, pp, cp, 1);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+	else
+#endif
+		ip_vs_nat_icmp(skb, pp, cp, 1);
 
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
@@ -708,6 +713,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	unsigned int offset, ihl;
+	union nf_inet_addr snet;
 
 	*related = 1;
 
@@ -766,7 +772,9 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 	if (!cp)
 		return NF_ACCEPT;
 
-	return handle_response_icmp(skb, iph, cih, cp, pp, offset, ihl);
+	snet.ip = iph->saddr;
+	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+				    pp, offset, ihl);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -779,7 +787,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 	struct ip_vs_iphdr ciph;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
-	unsigned int offset, verdict;
+	unsigned int offset;
+	union nf_inet_addr snet;
 
 	*related = 1;
 
@@ -838,40 +847,9 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 	if (!cp)
 		return NF_ACCEPT;
 
-	verdict = NF_DROP;
-
-	if (IP_VS_FWD_METHOD(cp) != 0) {
-		IP_VS_ERR("shouldn't reach here, because the box is on the "
-			  "half connection in the tun/dr module.\n");
-	}
-
-	/* Ensure the checksum is correct */
-	if (!skb_csum_unnecessary(skb)
-	    && ip_vs_checksum_complete(skb, sizeof(struct ipv6hdr))) {
-		/* Failed checksum! */
-		IP_VS_DBG(1, "Forward ICMPv6: failed checksum from "
-			  NIP6_FMT "!\n",
-			  NIP6(iph->saddr));
-		goto out;
-	}
-
-	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
-		offset += 2 * sizeof(__u16);
-	if (!skb_make_writable(skb, offset))
-		goto out;
-
-	ip_vs_nat_icmp_v6(skb, pp, cp, 1);
-
-	/* do the statistics and put it back */
-	ip_vs_out_stats(cp, skb);
-
-	skb->ipvs_property = 1;
-	verdict = NF_ACCEPT;
-
-out:
-	__ip_vs_conn_put(cp);
-
-	return verdict;
+	snet.in6 = iph->saddr;
+	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
+				    pp, offset, sizeof(struct ipv6hdr));
 }
 #endif
 
@@ -1055,7 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 							  ICMP_DEST_UNREACH,
 							  ICMP_PORT_UNREACH, 0);
 					return NF_DROP;
-				}
+			}
 			}
 		}
 		IP_VS_DBG_PKT(12, pp, skb, 0,
@@ -1083,6 +1061,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	unsigned int offset, ihl, verdict;
+	union nf_inet_addr snet;
 
 	*related = 1;
 
@@ -1142,9 +1121,12 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (!cp) {
 		/* The packet could also belong to a local client */
 		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
-		if (cp)
-			return handle_response_icmp(skb, iph, cih, cp, pp,
+		if (cp) {
+			snet.ip = iph->saddr;
+			return handle_response_icmp(AF_INET, skb, &snet,
+						    cih->protocol, cp, pp,
 						    offset, ihl);
+		}
 		return NF_ACCEPT;
 	}
 
@@ -1183,6 +1165,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	unsigned int offset, verdict;
+	union nf_inet_addr snet;
 
 	*related = 1;
 
@@ -1240,8 +1223,18 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
 	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
-	if (!cp)
+	if (!cp) {
+		/* The packet could also belong to a local client */
+		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+		if (cp) {
+			snet.in6 = iph->saddr;
+			return handle_response_icmp(AF_INET6, skb, &snet,
+						    cih->nexthdr,
+						    cp, pp, offset,
+						    sizeof(struct ipv6hdr));
+		}
 		return NF_ACCEPT;
+	}
 
 	verdict = NF_DROP;
 
@@ -1281,9 +1274,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	 *	Big tappo: only PACKET_HOST, including loopback for local client
 	 *	Don't handle local packets on IPv6 for now
 	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST ||
-		     (af == AF_INET6 || (skb->dev->flags & IFF_LOOPBACK ||
-					 skb->sk)))) {
+	if (unlikely(skb->pkt_type != PACKET_HOST)) {
 		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
 			      skb->pkt_type,
 			      iph.protocol,
-- 
cgit v1.2.3


From f59ac0481660e66cec67f1d6b024e78b9dc715fe Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <lrodriguez@atheros.com>
Date: Fri, 29 Aug 2008 16:26:43 -0700
Subject: cfg80211: keep track of supported interface modes

It is obviously good for userspace to know up front which
interface modes a given piece of hardware might support (even
if adding such an interface might fail later because of
concurrency issues), so let's make cfg80211 aware of that.
For good measure, disallow adding interfaces in all other
modes so drivers don't forget to announce support for one mode
when they add it.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Stephen Blackheath <tramp.enshrine.stephen@blacksapphire.com>
Signed-off-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c    |  7 +++++++
 net/wireless/core.c    |  9 ++++++++-
 net/wireless/nl80211.c | 22 ++++++++++++++++++++--
 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 638b75f36e2..396cfb2d0f4 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1675,6 +1675,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		}
 	}
 
+	/* if low-level driver supports AP, we also support VLAN */
+	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP))
+		local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN);
+
+	/* mac80211 always supports monitor */
+	local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
+
 	result = wiphy_register(local->hw.wiphy);
 	if (result < 0)
 		return result;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index f1da0b93bc5..7e995ac06a0 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1,7 +1,7 @@
 /*
  * This is the linux wireless configuration interface.
  *
- * Copyright 2006, 2007		Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2006-2008		Johannes Berg <johannes@sipsolutions.net>
  */
 
 #include <linux/if.h>
@@ -259,6 +259,13 @@ int wiphy_register(struct wiphy *wiphy)
 	struct ieee80211_supported_band *sband;
 	bool have_band = false;
 	int i;
+	u16 ifmodes = wiphy->interface_modes;
+
+	/* sanity check ifmodes */
+	WARN_ON(!ifmodes);
+	ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1;
+	if (WARN_ON(ifmodes != wiphy->interface_modes))
+		wiphy->interface_modes = ifmodes;
 
 	/* sanity check supported bands/channels */
 	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4d6c02afd6f..77880ba8b61 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -113,10 +113,12 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	struct nlattr *nl_bands, *nl_band;
 	struct nlattr *nl_freqs, *nl_freq;
 	struct nlattr *nl_rates, *nl_rate;
+	struct nlattr *nl_modes;
 	enum ieee80211_band band;
 	struct ieee80211_channel *chan;
 	struct ieee80211_rate *rate;
 	int i;
+	u16 ifmodes = dev->wiphy.interface_modes;
 
 	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY);
 	if (!hdr)
@@ -125,6 +127,20 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx);
 	NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy));
 
+	nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES);
+	if (!nl_modes)
+		goto nla_put_failure;
+
+	i = 0;
+	while (ifmodes) {
+		if (ifmodes & 1)
+			NLA_PUT_FLAG(msg, i);
+		ifmodes >>= 1;
+		i++;
+	}
+
+	nla_nest_end(msg, nl_modes);
+
 	nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS);
 	if (!nl_bands)
 		goto nla_put_failure;
@@ -415,7 +431,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	ifindex = dev->ifindex;
 	dev_put(dev);
 
-	if (!drv->ops->change_virtual_intf) {
+	if (!drv->ops->change_virtual_intf ||
+	    !(drv->wiphy.interface_modes & (1 << type))) {
 		err = -EOPNOTSUPP;
 		goto unlock;
 	}
@@ -462,7 +479,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	if (IS_ERR(drv))
 		return PTR_ERR(drv);
 
-	if (!drv->ops->add_virtual_intf) {
+	if (!drv->ops->add_virtual_intf ||
+	    !(drv->wiphy.interface_modes & (1 << type))) {
 		err = -EOPNOTSUPP;
 		goto unlock;
 	}
-- 
cgit v1.2.3


From cd9fe6c4f0afe334862c871bf5d32770daa748ec Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Fri, 5 Sep 2008 13:46:00 +0200
Subject: ipvs: Use pointer to address from sync message

We want a pointer to it, not the value casted to a pointer.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_sync.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 40647edf102..28237a5f62e 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -397,11 +397,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
 			cp = ip_vs_conn_new(AF_INET, s->protocol,
-					    (union nf_inet_addr *)s->caddr,
+					    (union nf_inet_addr *)&s->caddr,
 					    s->cport,
-					    (union nf_inet_addr *)s->vaddr,
+					    (union nf_inet_addr *)&s->vaddr,
 					    s->vport,
-					    (union nf_inet_addr *)s->daddr,
+					    (union nf_inet_addr *)&s->daddr,
 					    s->dport,
 					    flags, dest);
 			if (dest)
-- 
cgit v1.2.3


From a5ba4bf2732c85d8c95e0432966f79aa2b159478 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Fri, 5 Sep 2008 13:47:37 +0200
Subject: ipvs: Return negative error values from ip_vs_edit_service()

Like the other code in this function does.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 7f89c588e58..d2dc05a843f 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1284,11 +1284,11 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6) {
 		if (!sched->supports_ipv6) {
-			ret = EAFNOSUPPORT;
+			ret = -EAFNOSUPPORT;
 			goto out;
 		}
 		if ((u->netmask < 1) || (u->netmask > 128)) {
-			ret = EINVAL;
+			ret = -EINVAL;
 			goto out;
 		}
 	}
-- 
cgit v1.2.3


From 77eb851630bba8ea9962a1b2f01b23bd5d57c58e Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Fri, 5 Sep 2008 14:43:00 +0200
Subject: ipvs: Mark tcp/udp v4 and v6 debug functions static

They are only used in this file, so they should be static

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_proto.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index 50f6215beda..b06da1c3445 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -151,7 +151,7 @@ const char * ip_vs_state_name(__u16 proto, int state)
 }
 
 
-void
+static void
 ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 			     const struct sk_buff *skb,
 			     int offset,
@@ -190,7 +190,7 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-void
+static void
 ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 			     const struct sk_buff *skb,
 			     int offset,
-- 
cgit v1.2.3


From 3bfb92f4073aa829f8e67e459d54c79306ddbd73 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Fri, 5 Sep 2008 16:53:49 +0200
Subject: ipvs: Reject ipv6 link-local addresses for destinations

We can't use non-local link-local addresses for destinations, without
knowing the interface on which we can reach the address. Reject them for
now.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index d2dc05a843f..e53efe41f01 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -838,7 +838,8 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 #ifdef CONFIG_IP_VS_IPV6
 	if (svc->af == AF_INET6) {
 		atype = ipv6_addr_type(&udest->addr.in6);
-		if (!(atype & IPV6_ADDR_UNICAST) &&
+		if ((!(atype & IPV6_ADDR_UNICAST) ||
+			atype & IPV6_ADDR_LINKLOCAL) &&
 			!__ip_vs_addr_is_local_v6(&udest->addr.in6))
 			return -EINVAL;
 	} else
-- 
cgit v1.2.3


From 5af149cc34143c4e24abcc6355b29b3161eff3b8 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Mon, 8 Sep 2008 09:34:45 +1000
Subject: IPVS: fix bogus indentation

Sorry, this was my error.
Thanks to Julius Volz for pointing it out.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julius Volz <juliusv@google.com>
---
 net/ipv4/ipvs/ip_vs_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 05797a5ce15..1f4f3b94359 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1033,7 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 							  ICMP_DEST_UNREACH,
 							  ICMP_PORT_UNREACH, 0);
 					return NF_DROP;
-			}
+				}
 			}
 		}
 		IP_VS_DBG_PKT(12, pp, skb, 0,
-- 
cgit v1.2.3


From 178f5e494e3c0252d06a9b1473016addff71e01e Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Mon, 8 Sep 2008 09:34:46 +1000
Subject: IPVS: use ipv6_addr_copy()

It is standard to use ipv6_addr_copy() to fill in
the in6 element of a union nf_inet_addr snet.

Thanks to Julius Volz for pointing this out.

Cc: Brian Haley <brian.haley@hp.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julius Volz <juliusv@google.com>
---
 net/ipv4/ipvs/ip_vs_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1f4f3b94359..f5180ac56be 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -847,7 +847,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 	if (!cp)
 		return NF_ACCEPT;
 
-	snet.in6 = iph->saddr;
+	ipv6_addr_copy(&snet.in6, &iph->saddr);
 	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
 				    pp, offset, sizeof(struct ipv6hdr));
 }
@@ -1227,7 +1227,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 		/* The packet could also belong to a local client */
 		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
 		if (cp) {
-			snet.in6 = iph->saddr;
+			ipv6_addr_copy(&snet.in6, &iph->saddr);
 			return handle_response_icmp(AF_INET6, skb, &snet,
 						    cih->nexthdr,
 						    cp, pp, offset,
-- 
cgit v1.2.3


From 9818babc8fd9a542978a235f1c1786f948cbac68 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Wed, 3 Sep 2008 23:42:19 +0300
Subject: mac80211: Fix low bit rate in IBSS

This patch fixes regression in iwlwifi IBSS rate scaling caused by patch:

    commit 6bc37c06bc424bcf3f944e6a79e2d5bb537e02ed
    Author: Vladimir Koutny <vlado@work.ksp.sk>
    Date:   Fri Jun 13 16:50:44 2008 +0200

        mac80211: eliminate IBSS warning in rate_lowest_index()

An IBSS station is added in prepare_for_handlers where the rate scaling was
initialized only with single rate matching the received packet.
The correct rate scale information should be updated only in
ieee80211_rx_bss_info function where beacon is parsed. Because
of coding error the rate info was left untouched.
If a beacon has triggered the connection the rate remined 1Mbps.
This patch fixes this coding error

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Cc: Vladimir Koutny <vlado@work.ksp.sk>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 7d53382f1a5..75510a9f3f1 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2595,7 +2595,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 							rx_status->band);
 
 		prev_rates = sta->supp_rates[rx_status->band];
-		sta->supp_rates[rx_status->band] &= supp_rates;
+		sta->supp_rates[rx_status->band] = supp_rates;
 		if (sta->supp_rates[rx_status->band] == 0) {
 			/* No matching rates - this should not really happen.
 			 * Make sure that at least one rate is marked
-- 
cgit v1.2.3


From 8e1535d51bc93fbe9b3ded6555680044bc571d19 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Wed, 3 Sep 2008 23:42:20 +0300
Subject: mac80211: Fix rate scale initialization in IBSS

This patch address some IBSS rate issues introduced or not covered
by "mac80211: eliminate IBSS warning in rate_lowest_index()" and
"cfg80211 API for channels/bitrates, mac80211 and driver conversion".

This patch:
1. Moves addition of IBSS station from
prepare_for_handlers to ieee80211_rx_bss_info when triggered from beacon
eliminating bogus supported rates.
2. Initialize properly supported rates also in IBSS merging
3. Ensure that mandatory rates are always added into supported
rates. This is needed in case when station addition is triggered from
non beacon/probe packet. Some management frames need to be sent
4. Remove initialization of supported rates from self rates. This path
was dead code after 6bc37c06bc4 and in general incorrect.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Cc: Vladimir Koutny <vlado@work.ksp.sk>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 90 +++++++++++++++++++++++++++++++++++++----------------
 net/mac80211/rx.c   |  4 ---
 2 files changed, 64 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 75510a9f3f1..c396c3522f7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2557,6 +2557,33 @@ u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 	return supp_rates;
 }
 
+static u64 ieee80211_sta_get_mandatory_rates(struct ieee80211_local *local,
+					enum ieee80211_band band)
+{
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *bitrates;
+	u64 mandatory_rates;
+	enum ieee80211_rate_flags mandatory_flag;
+	int i;
+
+	sband = local->hw.wiphy->bands[band];
+	if (!sband) {
+		WARN_ON(1);
+		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	}
+
+	if (band == IEEE80211_BAND_2GHZ)
+		mandatory_flag = IEEE80211_RATE_MANDATORY_B;
+	else
+		mandatory_flag = IEEE80211_RATE_MANDATORY_A;
+
+	bitrates = sband->bitrates;
+	mandatory_rates = 0;
+	for (i = 0; i < sband->n_bitrates; i++)
+		if (bitrates[i].flags & mandatory_flag)
+			mandatory_rates |= BIT(i);
+	return mandatory_rates;
+}
 
 static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_mgmt *mgmt,
@@ -2568,9 +2595,11 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	int freq, clen;
 	struct ieee80211_sta_bss *bss;
 	struct sta_info *sta;
-	u64 beacon_timestamp, rx_timestamp;
 	struct ieee80211_channel *channel;
+	u64 beacon_timestamp, rx_timestamp;
+	u64 supp_rates = 0;
 	bool beacon = ieee80211_is_beacon(mgmt->frame_control);
+	enum ieee80211_band band = rx_status->band;
 	DECLARE_MAC_BUF(mac);
 	DECLARE_MAC_BUF(mac2);
 
@@ -2578,30 +2607,41 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
 	    elems->mesh_config && mesh_matches_local(elems, sdata)) {
-		u64 rates = ieee80211_sta_get_rates(local, elems,
-						rx_status->band);
+		supp_rates = ieee80211_sta_get_rates(local, elems, band);
 
-		mesh_neighbour_update(mgmt->sa, rates, sdata,
+		mesh_neighbour_update(mgmt->sa, supp_rates, sdata,
 				      mesh_peer_accepts_plinks(elems));
 	}
 
 	rcu_read_lock();
 
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems->supp_rates &&
-	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0 &&
-	    (sta = sta_info_get(local, mgmt->sa))) {
-		u64 prev_rates;
-		u64 supp_rates = ieee80211_sta_get_rates(local, elems,
-							rx_status->band);
-
-		prev_rates = sta->supp_rates[rx_status->band];
-		sta->supp_rates[rx_status->band] = supp_rates;
-		if (sta->supp_rates[rx_status->band] == 0) {
-			/* No matching rates - this should not really happen.
-			 * Make sure that at least one rate is marked
-			 * supported to avoid issues with TX rate ctrl. */
-			sta->supp_rates[rx_status->band] =
-				sdata->u.sta.supp_rates_bits[rx_status->band];
+	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
+
+		supp_rates = ieee80211_sta_get_rates(local, elems, band);
+
+		sta = sta_info_get(local, mgmt->sa);
+		if (sta) {
+			u64 prev_rates;
+
+			prev_rates = sta->supp_rates[band];
+			/* make sure mandatory rates are always added */
+			sta->supp_rates[band] = supp_rates |
+				ieee80211_sta_get_mandatory_rates(local, band);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+			if (sta->supp_rates[band] != prev_rates)
+				printk(KERN_DEBUG "%s: updated supp_rates set "
+				    "for %s based on beacon info (0x%llx | "
+				    "0x%llx -> 0x%llx)\n",
+				    sdata->dev->name, print_mac(mac, sta->addr),
+				    (unsigned long long) prev_rates,
+				    (unsigned long long) supp_rates,
+				    (unsigned long long) sta->supp_rates[band]);
+#endif
+		} else {
+			ieee80211_ibss_add_sta(sdata, NULL, mgmt->bssid,
+					       mgmt->sa, supp_rates);
 		}
 	}
 
@@ -2683,7 +2723,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		bss->supp_rates_len += clen;
 	}
 
-	bss->band = rx_status->band;
+	bss->band = band;
 
 	bss->timestamp = beacon_timestamp;
 	bss->last_update = jiffies;
@@ -2738,7 +2778,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 			 * e.g: at 1 MBit that means mactime is 192 usec earlier
 			 * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
 			 */
-			int rate = local->hw.wiphy->bands[rx_status->band]->
+			int rate = local->hw.wiphy->bands[band]->
 					bitrates[rx_status->rate_idx].bitrate;
 			rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
 		} else if (local && local->ops && local->ops->get_tsf)
@@ -2766,7 +2806,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 			ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss);
 			ieee80211_ibss_add_sta(sdata, NULL,
 					       mgmt->bssid, mgmt->sa,
-					       BIT(rx_status->rate_idx));
+					       supp_rates);
 		}
 	}
 
@@ -3032,7 +3072,6 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *
 	kfree_skb(skb);
 }
 
-
 static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 					 struct sk_buff *skb)
 {
@@ -4316,10 +4355,9 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 
 	set_sta_flags(sta, WLAN_STA_AUTHORIZED);
 
-	if (supp_rates)
-		sta->supp_rates[band] = supp_rates;
-	else
-		sta->supp_rates[band] = sdata->u.sta.supp_rates_bits[band];
+	/* make sure mandatory rates are always added */
+	sta->supp_rates[band] = supp_rates |
+			ieee80211_sta_get_mandatory_rates(local, band);
 
 	rate_control_rate_init(sta, local);
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fd83ef760a3..7e09b30dd39 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1743,10 +1743,6 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 		if (!bssid)
 			return 0;
 		if (ieee80211_is_beacon(hdr->frame_control)) {
-			if (!rx->sta)
-				rx->sta = ieee80211_ibss_add_sta(sdata,
-						rx->skb, bssid, hdr->addr2,
-						BIT(rx->status->rate_idx));
 			return 1;
 		}
 		else if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) {
-- 
cgit v1.2.3


From 701b9cb37bae3d50dbe5f345a7cdbd2648bf7df6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 4 Sep 2008 09:24:53 -0700
Subject: mac80211: add missing kernel-doc

Fix mac80211 kernel-doc missing struct field:

Warning(linux-2.6.27-rc1-git2//net/mac80211/sta_info.h:329): No description found for parameter 'tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/sta_info.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 109db787ccb..4a581a5b576 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -204,6 +204,7 @@ struct sta_ampdu_mlme {
  * @tx_fragments: number of transmitted MPDUs
  * @txrate_idx: TBD
  * @last_txrate_idx: TBD
+ * @tid_seq: TBD
  * @wme_tx_queue: TBD
  * @ampdu_mlme: TBD
  * @timer_to_tid: identity mapping to ID timers
-- 
cgit v1.2.3


From e2a6b85247aacc52d6ba0d9b37a99b8d1a3e0d83 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 8 Sep 2008 16:10:02 -0700
Subject: net: Enable TSO if supported by at least one device

As it stands users of netdev_compute_features (e.g., bridges/bonding)
will only enable TSO if all consituent devices support it.  This
is unnecessarily pessimistic since even on devices that do not
support hardware TSO and SG, emulated TSO still performs to a par
with TSO off.

This patch enables TSO if at least on constituent device supports
it in hardware.

The direct beneficiaries will be virtualisation that uses bridging
since this means that TSO will always be enabled for communication
from the host to the guests.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 60c51f76588..abef86ec4cb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4663,6 +4663,12 @@ int netdev_compute_features(unsigned long all, unsigned long one)
 		one |= NETIF_F_GSO_SOFTWARE;
 	one |= NETIF_F_GSO;
 
+	/*
+	 * If even one device supports a GSO protocol with software fallback,
+	 * enable it for all.
+	 */
+	all |= one & NETIF_F_GSO_SOFTWARE;
+
 	/* If even one device supports robust GSO, enable it for all. */
 	if (one & NETIF_F_GSO_ROBUST)
 		all |= NETIF_F_GSO_ROBUST;
-- 
cgit v1.2.3


From 5337407c673e2c7c66a84b9838d55a45a760ecff Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 8 Sep 2008 16:17:42 -0700
Subject: warn: Turn the netdev timeout WARN_ON() into a WARN()

this patch turns the netdev timeout WARN_ON_ONCE() into a WARN_ONCE(),
so that the device and driver names are inside the warning message.
This helps automated tools like kerneloops.org to collect the data
and do statistics, as well as making it more likely that humans
cut-n-paste the important message as part of a bugreport.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 9634091ee2f..ec0a0839ce5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -215,10 +215,9 @@ static void dev_watchdog(unsigned long arg)
 			    time_after(jiffies, (dev->trans_start +
 						 dev->watchdog_timeo))) {
 				char drivername[64];
-				printk(KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n",
+				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n",
 				       dev->name, netdev_drivername(dev, drivername, 64));
 				dev->tx_timeout(dev);
-				WARN_ON_ONCE(1);
 			}
 			if (!mod_timer(&dev->watchdog_timer,
 				       round_jiffies(jiffies +
-- 
cgit v1.2.3


From 4aa678ba44aa35759c04f300afbc97d3dab5faa2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 8 Sep 2008 16:19:58 -0700
Subject: netns bridge: allow bridges in netns!

Bridge as netdevice doesn't cross netns boundaries.

Bridge ports and bridge itself live in same netns.

Notifiers are fixed.

netns propagated from userspace socket for setup and teardown.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Stephen Hemminger <shemming@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c   |  3 ++-
 net/bridge/br_if.c       | 11 ++++++-----
 net/bridge/br_ioctl.c    | 20 ++++++++++----------
 net/bridge/br_netlink.c  | 15 +++++----------
 net/bridge/br_notify.c   |  3 ---
 net/bridge/br_private.h  |  4 ++--
 net/bridge/br_stp_bpdu.c |  3 ---
 7 files changed, 25 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 4f52c3d50eb..22ba8632196 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -178,5 +178,6 @@ void br_dev_setup(struct net_device *dev)
 	dev->priv_flags = IFF_EBRIDGE;
 
 	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
-			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX;
+			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
+			NETIF_F_NETNS_LOCAL;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 63c18aacde8..66c4f7122cf 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -168,7 +168,7 @@ static void del_br(struct net_bridge *br)
 	unregister_netdevice(br->dev);
 }
 
-static struct net_device *new_bridge_dev(const char *name)
+static struct net_device *new_bridge_dev(struct net *net, const char *name)
 {
 	struct net_bridge *br;
 	struct net_device *dev;
@@ -178,6 +178,7 @@ static struct net_device *new_bridge_dev(const char *name)
 
 	if (!dev)
 		return NULL;
+	dev_net_set(dev, net);
 
 	br = netdev_priv(dev);
 	br->dev = dev;
@@ -262,12 +263,12 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	return p;
 }
 
-int br_add_bridge(const char *name)
+int br_add_bridge(struct net *net, const char *name)
 {
 	struct net_device *dev;
 	int ret;
 
-	dev = new_bridge_dev(name);
+	dev = new_bridge_dev(net, name);
 	if (!dev)
 		return -ENOMEM;
 
@@ -294,13 +295,13 @@ out_free:
 	goto out;
 }
 
-int br_del_bridge(const char *name)
+int br_del_bridge(struct net *net, const char *name)
 {
 	struct net_device *dev;
 	int ret = 0;
 
 	rtnl_lock();
-	dev = __dev_get_by_name(&init_net, name);
+	dev = __dev_get_by_name(net, name);
 	if (dev == NULL)
 		ret =  -ENXIO; 	/* Could not find device */
 
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index eeee218eed8..3ec1c636e62 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -21,12 +21,12 @@
 #include "br_private.h"
 
 /* called with RTNL */
-static int get_bridge_ifindices(int *indices, int num)
+static int get_bridge_ifindices(struct net *net, int *indices, int num)
 {
 	struct net_device *dev;
 	int i = 0;
 
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		if (i >= num)
 			break;
 		if (dev->priv_flags & IFF_EBRIDGE)
@@ -89,7 +89,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
-	dev = dev_get_by_index(&init_net, ifindex);
+	dev = dev_get_by_index(dev_net(br->dev), ifindex);
 	if (dev == NULL)
 		return -EINVAL;
 
@@ -309,7 +309,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	return -EOPNOTSUPP;
 }
 
-static int old_deviceless(void __user *uarg)
+static int old_deviceless(struct net *net, void __user *uarg)
 {
 	unsigned long args[3];
 
@@ -331,7 +331,7 @@ static int old_deviceless(void __user *uarg)
 		if (indices == NULL)
 			return -ENOMEM;
 
-		args[2] = get_bridge_ifindices(indices, args[2]);
+		args[2] = get_bridge_ifindices(net, indices, args[2]);
 
 		ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int))
 			? -EFAULT : args[2];
@@ -354,9 +354,9 @@ static int old_deviceless(void __user *uarg)
 		buf[IFNAMSIZ-1] = 0;
 
 		if (args[0] == BRCTL_ADD_BRIDGE)
-			return br_add_bridge(buf);
+			return br_add_bridge(net, buf);
 
-		return br_del_bridge(buf);
+		return br_del_bridge(net, buf);
 	}
 	}
 
@@ -368,7 +368,7 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uar
 	switch (cmd) {
 	case SIOCGIFBR:
 	case SIOCSIFBR:
-		return old_deviceless(uarg);
+		return old_deviceless(net, uarg);
 
 	case SIOCBRADDBR:
 	case SIOCBRDELBR:
@@ -383,9 +383,9 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uar
 
 		buf[IFNAMSIZ-1] = 0;
 		if (cmd == SIOCBRADDBR)
-			return br_add_bridge(buf);
+			return br_add_bridge(net, buf);
 
-		return br_del_bridge(buf);
+		return br_del_bridge(net, buf);
 	}
 	}
 	return -EOPNOTSUPP;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index f155e6ce8a2..ba7be195803 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -82,6 +82,7 @@ nla_put_failure:
  */
 void br_ifinfo_notify(int event, struct net_bridge_port *port)
 {
+	struct net *net = dev_net(port->dev);
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
@@ -97,10 +98,10 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, &init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+	err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
 errout:
 	if (err < 0)
-		rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err);
+		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
 }
 
 /*
@@ -112,11 +113,8 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	struct net_device *dev;
 	int idx;
 
-	if (net != &init_net)
-		return 0;
-
 	idx = 0;
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		/* not a bridge port */
 		if (dev->br_port == NULL || idx < cb->args[0])
 			goto skip;
@@ -147,9 +145,6 @@ static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
 	struct net_bridge_port *p;
 	u8 new_state;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 	if (nlmsg_len(nlh) < sizeof(*ifm))
 		return -EINVAL;
 
@@ -165,7 +160,7 @@ static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
 	if (new_state > BR_STATE_BLOCKING)
 		return -EINVAL;
 
-	dev = __dev_get_by_index(&init_net, ifm->ifi_index);
+	dev = __dev_get_by_index(net, ifm->ifi_index);
 	if (!dev)
 		return -ENODEV;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 76340bdd052..763a3ec292e 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -35,9 +35,6 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 	struct net_bridge_port *p = dev->br_port;
 	struct net_bridge *br;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
-
 	/* not a port of a bridge */
 	if (p == NULL)
 		return NOTIFY_DONE;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c3dc18ddc04..51eaeaaa58c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -178,8 +178,8 @@ extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb);
 
 /* br_if.c */
 extern void br_port_carrier_check(struct net_bridge_port *p);
-extern int br_add_bridge(const char *name);
-extern int br_del_bridge(const char *name);
+extern int br_add_bridge(struct net *net, const char *name);
+extern int br_del_bridge(struct net *net, const char *name);
 extern void br_cleanup_bridges(void);
 extern int br_add_if(struct net_bridge *br,
 	      struct net_device *dev);
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 8b200f96f72..81ae40b3f65 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -140,9 +140,6 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
 	struct net_bridge *br;
 	const unsigned char *buf;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		goto err;
-
 	if (!p)
 		goto err;
 
-- 
cgit v1.2.3


From 712d6954e3998d0de2840d8130941e8042541246 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 8 Sep 2008 16:20:18 -0700
Subject: netns bridge: cleanup bridges during netns stop

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Stephen Hemminger <shemming@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c         | 22 ++++++++++++++++------
 net/bridge/br_if.c      |  4 ++--
 net/bridge/br_private.h |  2 +-
 3 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 573acdf6f9f..4d2c1f1cb52 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -28,6 +28,10 @@ static const struct stp_proto br_stp_proto = {
 	.rcv	= br_stp_rcv,
 };
 
+static struct pernet_operations br_net_ops = {
+	.exit	= br_net_exit,
+};
+
 static int __init br_init(void)
 {
 	int err;
@@ -42,18 +46,22 @@ static int __init br_init(void)
 	if (err)
 		goto err_out;
 
-	err = br_netfilter_init();
+	err = register_pernet_subsys(&br_net_ops);
 	if (err)
 		goto err_out1;
 
-	err = register_netdevice_notifier(&br_device_notifier);
+	err = br_netfilter_init();
 	if (err)
 		goto err_out2;
 
-	err = br_netlink_init();
+	err = register_netdevice_notifier(&br_device_notifier);
 	if (err)
 		goto err_out3;
 
+	err = br_netlink_init();
+	if (err)
+		goto err_out4;
+
 	brioctl_set(br_ioctl_deviceless_stub);
 	br_handle_frame_hook = br_handle_frame;
 
@@ -61,10 +69,12 @@ static int __init br_init(void)
 	br_fdb_put_hook = br_fdb_put;
 
 	return 0;
-err_out3:
+err_out4:
 	unregister_netdevice_notifier(&br_device_notifier);
-err_out2:
+err_out3:
 	br_netfilter_fini();
+err_out2:
+	unregister_pernet_subsys(&br_net_ops);
 err_out1:
 	br_fdb_fini();
 err_out:
@@ -80,7 +90,7 @@ static void __exit br_deinit(void)
 	unregister_netdevice_notifier(&br_device_notifier);
 	brioctl_set(NULL);
 
-	br_cleanup_bridges();
+	unregister_pernet_subsys(&br_net_ops);
 
 	synchronize_net();
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 66c4f7122cf..573e20f7dba 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -446,13 +446,13 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 	return 0;
 }
 
-void __exit br_cleanup_bridges(void)
+void br_net_exit(struct net *net)
 {
 	struct net_device *dev;
 
 	rtnl_lock();
 restart:
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		if (dev->priv_flags & IFF_EBRIDGE) {
 			del_br(dev->priv);
 			goto restart;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 51eaeaaa58c..b6c3b71974d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -180,7 +180,7 @@ extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb);
 extern void br_port_carrier_check(struct net_bridge_port *p);
 extern int br_add_bridge(struct net *net, const char *name);
 extern int br_del_bridge(struct net *net, const char *name);
-extern void br_cleanup_bridges(void);
+extern void br_net_exit(struct net *net);
 extern int br_add_if(struct net_bridge *br,
 	      struct net_device *dev);
 extern int br_del_if(struct net_bridge *br,
-- 
cgit v1.2.3


From 503e81f65adac596a0275ea0230f2ae1fd64c301 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Mon, 8 Sep 2008 12:04:21 +1000
Subject: ipvs: handle PARTIAL_CHECKSUM

Now that LVS can load balance locally generated traffic, packets may come
from the loopback device and thus may have a partial checksum.

The existing code allows for the case where there is no checksum at all for
TCP, however Herbert Xu has confirmed that this is not legal.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julius Volz <juliusv@google.com>
---
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 37 +++++++++++++++++++++++++++++++++++--
 net/ipv4/ipvs/ip_vs_proto_udp.c | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 70 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 808e8be0280..537f616776d 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -134,12 +134,34 @@ tcp_fast_csum_update(int af, struct tcphdr *tcph,
 }
 
 
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcph->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(tcph->check))));
+	else
+#endif
+	tcph->check =
+		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+				ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(tcph->check))));
+}
+
+
 static int
 tcp_snat_handler(struct sk_buff *skb,
 		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
+	int oldlen;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -147,6 +169,7 @@ tcp_snat_handler(struct sk_buff *skb,
 	else
 #endif
 		tcphoff = ip_hdrlen(skb);
+	oldlen = skb->len - tcphoff;
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
@@ -166,7 +189,11 @@ tcp_snat_handler(struct sk_buff *skb,
 	tcph->source = cp->vport;
 
 	/* Adjust TCP checksums */
-	if (!cp->app && (tcph->check != 0)) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - tcphoff));
+	} else if (!cp->app) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
@@ -204,6 +231,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 {
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
+	int oldlen;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -211,6 +239,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	else
 #endif
 		tcphoff = ip_hdrlen(skb);
+	oldlen = skb->len - tcphoff;
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
@@ -235,7 +264,11 @@ tcp_dnat_handler(struct sk_buff *skb,
 	/*
 	 *	Adjust TCP checksums
 	 */
-	if (!cp->app && (tcph->check != 0)) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - tcphoff));
+	} else if (!cp->app) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 5f2073e41cf..e3ee26bd1de 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -141,12 +141,34 @@ udp_fast_csum_update(int af, struct udphdr *uhdr,
 		uhdr->check = CSUM_MANGLED_0;
 }
 
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		uhdr->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(uhdr->check))));
+	else
+#endif
+	uhdr->check =
+		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+				ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(uhdr->check))));
+}
+
+
 static int
 udp_snat_handler(struct sk_buff *skb,
 		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
 	struct udphdr *udph;
 	unsigned int udphoff;
+	int oldlen;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -154,6 +176,7 @@ udp_snat_handler(struct sk_buff *skb,
 	else
 #endif
 		udphoff = ip_hdrlen(skb);
+	oldlen = skb->len - udphoff;
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
@@ -177,7 +200,11 @@ udp_snat_handler(struct sk_buff *skb,
 	/*
 	 *	Adjust UDP checksums
 	 */
-	if (!cp->app && (udph->check != 0)) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - udphoff));
+	} else if (!cp->app && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
@@ -216,6 +243,7 @@ udp_dnat_handler(struct sk_buff *skb,
 {
 	struct udphdr *udph;
 	unsigned int udphoff;
+	int oldlen;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -223,6 +251,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	else
 #endif
 		udphoff = ip_hdrlen(skb);
+	oldlen = skb->len - udphoff;
 
 	/* csum_check requires unshared skb */
 	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
@@ -247,7 +276,11 @@ udp_dnat_handler(struct sk_buff *skb,
 	/*
 	 *	Adjust UDP checksums
 	 */
-	if (!cp->app && (udph->check != 0)) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - udphoff));
+	} else if (!cp->app && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
-- 
cgit v1.2.3


From 9d7f2a2b1aa9e55537a053c68bdbd119fc479dd3 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Mon, 8 Sep 2008 14:55:42 +0200
Subject: IPVS: Remove incorrect ip_route_me_harder(), fix IPv6

Remove an incorrect ip_route_me_harder() that was probably a result of
merging my IPv6 patches with the local client patches. With this, IPv6+NAT
are working again.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index f5180ac56be..bdc92d73fbe 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -904,15 +904,6 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			goto drop;
 
-	/* For policy routing, packets originating from this
-	 * machine itself may be routed differently to packets
-	 * passing through.  We want this packet to be routed as
-	 * if it came from this machine itself.  So re-compute
-	 * the routing information.
-	 */
-	if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-		goto drop;
-
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 
 	ip_vs_out_stats(cp, skb);
-- 
cgit v1.2.3


From 2206a3f5b75be5dadf11541961bd7c924857eb5d Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Mon, 8 Sep 2008 13:38:11 +0200
Subject: ipvs: Restrict connection table size via Kconfig

Instead of checking the value in include/net/ip_vs.h, we can just
restrict the range in our Kconfig file. This will prevent values outside
of the range early.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Reviewed-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 794cecb249a..de6004de80b 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -41,7 +41,8 @@ config	IP_VS_DEBUG
 
 config	IP_VS_TAB_BITS
 	int "IPVS connection table size (the Nth power of 2)"
-	default "12" 
+	range 8 20
+	default 12
 	---help---
 	  The IPVS connection hash table uses the chaining scheme to handle
 	  hash collisions. Using a big IPVS connection hash table will greatly
-- 
cgit v1.2.3


From e9c0ce232e7a36daae1ca08282609d7f0c57c567 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Mon, 8 Sep 2008 13:39:04 +0200
Subject: ipvs: Embed user stats structure into kernel stats structure

Instead of duplicating the fields, integrate a user stats structure into
the kernel stats structure. This is more robust when the members are
changed, because they are now automatically kept in sync.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Reviewed-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 30 +++++++++++++-------------
 net/ipv4/ipvs/ip_vs_ctl.c  | 53 ++++++++++++++++++----------------------------
 net/ipv4/ipvs/ip_vs_est.c  | 40 +++++++++++++++++-----------------
 3 files changed, 56 insertions(+), 67 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index bdc92d73fbe..80a4fcf33a5 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -102,18 +102,18 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 	struct ip_vs_dest *dest = cp->dest;
 	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		spin_lock(&dest->stats.lock);
-		dest->stats.inpkts++;
-		dest->stats.inbytes += skb->len;
+		dest->stats.ustats.inpkts++;
+		dest->stats.ustats.inbytes += skb->len;
 		spin_unlock(&dest->stats.lock);
 
 		spin_lock(&dest->svc->stats.lock);
-		dest->svc->stats.inpkts++;
-		dest->svc->stats.inbytes += skb->len;
+		dest->svc->stats.ustats.inpkts++;
+		dest->svc->stats.ustats.inbytes += skb->len;
 		spin_unlock(&dest->svc->stats.lock);
 
 		spin_lock(&ip_vs_stats.lock);
-		ip_vs_stats.inpkts++;
-		ip_vs_stats.inbytes += skb->len;
+		ip_vs_stats.ustats.inpkts++;
+		ip_vs_stats.ustats.inbytes += skb->len;
 		spin_unlock(&ip_vs_stats.lock);
 	}
 }
@@ -125,18 +125,18 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 	struct ip_vs_dest *dest = cp->dest;
 	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		spin_lock(&dest->stats.lock);
-		dest->stats.outpkts++;
-		dest->stats.outbytes += skb->len;
+		dest->stats.ustats.outpkts++;
+		dest->stats.ustats.outbytes += skb->len;
 		spin_unlock(&dest->stats.lock);
 
 		spin_lock(&dest->svc->stats.lock);
-		dest->svc->stats.outpkts++;
-		dest->svc->stats.outbytes += skb->len;
+		dest->svc->stats.ustats.outpkts++;
+		dest->svc->stats.ustats.outbytes += skb->len;
 		spin_unlock(&dest->svc->stats.lock);
 
 		spin_lock(&ip_vs_stats.lock);
-		ip_vs_stats.outpkts++;
-		ip_vs_stats.outbytes += skb->len;
+		ip_vs_stats.ustats.outpkts++;
+		ip_vs_stats.ustats.outbytes += skb->len;
 		spin_unlock(&ip_vs_stats.lock);
 	}
 }
@@ -146,15 +146,15 @@ static inline void
 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 {
 	spin_lock(&cp->dest->stats.lock);
-	cp->dest->stats.conns++;
+	cp->dest->stats.ustats.conns++;
 	spin_unlock(&cp->dest->stats.lock);
 
 	spin_lock(&svc->stats.lock);
-	svc->stats.conns++;
+	svc->stats.ustats.conns++;
 	spin_unlock(&svc->stats.lock);
 
 	spin_lock(&ip_vs_stats.lock);
-	ip_vs_stats.conns++;
+	ip_vs_stats.ustats.conns++;
 	spin_unlock(&ip_vs_stats.lock);
 }
 
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index e53efe41f01..993a83fb0d5 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -744,18 +744,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
 {
 	spin_lock_bh(&stats->lock);
 
-	stats->conns = 0;
-	stats->inpkts = 0;
-	stats->outpkts = 0;
-	stats->inbytes = 0;
-	stats->outbytes = 0;
-
-	stats->cps = 0;
-	stats->inpps = 0;
-	stats->outpps = 0;
-	stats->inbps = 0;
-	stats->outbps = 0;
-
+	memset(&stats->ustats, 0, sizeof(stats->ustats));
 	ip_vs_zero_estimator(stats);
 
 	spin_unlock_bh(&stats->lock);
@@ -1964,20 +1953,20 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
 		   "   Conns  Packets  Packets            Bytes            Bytes\n");
 
 	spin_lock_bh(&ip_vs_stats.lock);
-	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
-		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
-		   (unsigned long long) ip_vs_stats.inbytes,
-		   (unsigned long long) ip_vs_stats.outbytes);
+	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
+		   ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
+		   (unsigned long long) ip_vs_stats.ustats.inbytes,
+		   (unsigned long long) ip_vs_stats.ustats.outbytes);
 
 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
 	seq_puts(seq,
 		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
 	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
-			ip_vs_stats.cps,
-			ip_vs_stats.inpps,
-			ip_vs_stats.outpps,
-			ip_vs_stats.inbps,
-			ip_vs_stats.outbps);
+			ip_vs_stats.ustats.cps,
+			ip_vs_stats.ustats.inpps,
+			ip_vs_stats.ustats.outpps,
+			ip_vs_stats.ustats.inbps,
+			ip_vs_stats.ustats.outbps);
 	spin_unlock_bh(&ip_vs_stats.lock);
 
 	return 0;
@@ -2215,7 +2204,7 @@ static void
 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
 {
 	spin_lock_bh(&src->lock);
-	memcpy(dst, src, (char*)&src->lock - (char*)src);
+	memcpy(dst, &src->ustats, sizeof(*dst));
 	spin_unlock_bh(&src->lock);
 }
 
@@ -2591,16 +2580,16 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
 
 	spin_lock_bh(&stats->lock);
 
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts);
-	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes);
-	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
 
 	spin_unlock_bh(&stats->lock);
 
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 4fb620ec208..2eb2860dabb 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -65,37 +65,37 @@ static void estimation_timer(unsigned long arg)
 		s = container_of(e, struct ip_vs_stats, est);
 
 		spin_lock(&s->lock);
-		n_conns = s->conns;
-		n_inpkts = s->inpkts;
-		n_outpkts = s->outpkts;
-		n_inbytes = s->inbytes;
-		n_outbytes = s->outbytes;
+		n_conns = s->ustats.conns;
+		n_inpkts = s->ustats.inpkts;
+		n_outpkts = s->ustats.outpkts;
+		n_inbytes = s->ustats.inbytes;
+		n_outbytes = s->ustats.outbytes;
 
 		/* scaled by 2^10, but divided 2 seconds */
 		rate = (n_conns - e->last_conns)<<9;
 		e->last_conns = n_conns;
 		e->cps += ((long)rate - (long)e->cps)>>2;
-		s->cps = (e->cps+0x1FF)>>10;
+		s->ustats.cps = (e->cps+0x1FF)>>10;
 
 		rate = (n_inpkts - e->last_inpkts)<<9;
 		e->last_inpkts = n_inpkts;
 		e->inpps += ((long)rate - (long)e->inpps)>>2;
-		s->inpps = (e->inpps+0x1FF)>>10;
+		s->ustats.inpps = (e->inpps+0x1FF)>>10;
 
 		rate = (n_outpkts - e->last_outpkts)<<9;
 		e->last_outpkts = n_outpkts;
 		e->outpps += ((long)rate - (long)e->outpps)>>2;
-		s->outpps = (e->outpps+0x1FF)>>10;
+		s->ustats.outpps = (e->outpps+0x1FF)>>10;
 
 		rate = (n_inbytes - e->last_inbytes)<<4;
 		e->last_inbytes = n_inbytes;
 		e->inbps += ((long)rate - (long)e->inbps)>>2;
-		s->inbps = (e->inbps+0xF)>>5;
+		s->ustats.inbps = (e->inbps+0xF)>>5;
 
 		rate = (n_outbytes - e->last_outbytes)<<4;
 		e->last_outbytes = n_outbytes;
 		e->outbps += ((long)rate - (long)e->outbps)>>2;
-		s->outbps = (e->outbps+0xF)>>5;
+		s->ustats.outbps = (e->outbps+0xF)>>5;
 		spin_unlock(&s->lock);
 	}
 	spin_unlock(&est_lock);
@@ -108,20 +108,20 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
 
 	INIT_LIST_HEAD(&est->list);
 
-	est->last_conns = stats->conns;
-	est->cps = stats->cps<<10;
+	est->last_conns = stats->ustats.conns;
+	est->cps = stats->ustats.cps<<10;
 
-	est->last_inpkts = stats->inpkts;
-	est->inpps = stats->inpps<<10;
+	est->last_inpkts = stats->ustats.inpkts;
+	est->inpps = stats->ustats.inpps<<10;
 
-	est->last_outpkts = stats->outpkts;
-	est->outpps = stats->outpps<<10;
+	est->last_outpkts = stats->ustats.outpkts;
+	est->outpps = stats->ustats.outpps<<10;
 
-	est->last_inbytes = stats->inbytes;
-	est->inbps = stats->inbps<<5;
+	est->last_inbytes = stats->ustats.inbytes;
+	est->inbps = stats->ustats.inbps<<5;
 
-	est->last_outbytes = stats->outbytes;
-	est->outbps = stats->outbps<<5;
+	est->last_outbytes = stats->ustats.outbytes;
+	est->outbps = stats->ustats.outbps<<5;
 
 	spin_lock_bh(&est_lock);
 	list_add(&est->list, &est_list);
-- 
cgit v1.2.3


From 410e27a49bb98bc7fa3ff5fc05cc313817b9f253 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 9 Sep 2008 13:27:22 +0200
Subject: This reverts "Merge branch 'dccp' of
 git://eden-feed.erg.abdn.ac.uk/dccp_exp" as it accentally contained the wrong
 set of patches. These will be submitted separately. Signed-off-by: Gerrit
 Renker <gerrit@erg.abdn.ac.uk>

---
 net/dccp/Kconfig                    |    3 +
 net/dccp/Makefile                   |    5 +-
 net/dccp/ackvec.c                   |  619 ++++++------
 net/dccp/ackvec.h                   |  204 ++--
 net/dccp/ccid.c                     |  101 +-
 net/dccp/ccid.h                     |  113 +--
 net/dccp/ccids/Kconfig              |   30 +-
 net/dccp/ccids/ccid2.c              |  622 +++++++-----
 net/dccp/ccids/ccid2.h              |   63 +-
 net/dccp/ccids/ccid3.c              |  762 +++++++++------
 net/dccp/ccids/ccid3.h              |  153 +--
 net/dccp/ccids/lib/loss_interval.c  |   30 +-
 net/dccp/ccids/lib/loss_interval.h  |    4 +-
 net/dccp/ccids/lib/packet_history.c |  282 +++---
 net/dccp/ccids/lib/packet_history.h |   78 +-
 net/dccp/ccids/lib/tfrc.h           |   16 -
 net/dccp/ccids/lib/tfrc_equation.c  |   29 +-
 net/dccp/dccp.h                     |  104 +-
 net/dccp/diag.c                     |    2 +-
 net/dccp/feat.c                     | 1805 +++++++++--------------------------
 net/dccp/feat.h                     |  144 +--
 net/dccp/input.c                    |  164 ++--
 net/dccp/ipv4.c                     |    4 +-
 net/dccp/ipv6.c                     |    4 +-
 net/dccp/minisocks.c                |   87 +-
 net/dccp/options.c                  |  341 ++++---
 net/dccp/output.c                   |  279 ++----
 net/dccp/probe.c                    |   75 +-
 net/dccp/proto.c                    |  281 ++----
 net/dccp/qpolicy.c                  |  137 ---
 net/dccp/sysctl.c                   |   64 +-
 net/dccp/timer.c                    |   42 +-
 net/ipv4/tcp_input.c                |   17 +-
 33 files changed, 2801 insertions(+), 3863 deletions(-)
 delete mode 100644 net/dccp/qpolicy.c

(limited to 'net')

diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 206c16ad9c3..7aa2a7acc7e 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -25,6 +25,9 @@ config INET_DCCP_DIAG
 	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
 	def_tristate m
 
+config IP_DCCP_ACKVEC
+	bool
+
 source "net/dccp/ccids/Kconfig"
 
 menu "DCCP Kernel Hacking"
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 0c1c9af2bf7..f4f8793aaff 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,6 @@
 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
 
-dccp-y := ccid.o feat.o input.o minisocks.o options.o \
-	  qpolicy.o output.o proto.o timer.o ackvec.o
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
 
 dccp_ipv4-y := ipv4.o
 
@@ -9,6 +8,8 @@ dccp_ipv4-y := ipv4.o
 obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
 dccp_ipv6-y := ipv6.o
 
+dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
+
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
 obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
 
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 41819848bdd..1e8be246ad1 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,375 +1,445 @@
 /*
  *  net/dccp/ackvec.c
  *
- *  An implementation of Ack Vectors for the DCCP protocol
- *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
+ *  An implementation of the DCCP protocol
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  *
  *      This program is free software; you can redistribute it and/or modify it
  *      under the terms of the GNU General Public License as published by the
  *      Free Software Foundation; version 2 of the License;
  */
+
+#include "ackvec.h"
 #include "dccp.h"
+
+#include <linux/dccp.h>
+#include <linux/init.h>
+#include <linux/errno.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 #include <linux/slab.h>
 
+#include <net/sock.h>
+
 static struct kmem_cache *dccp_ackvec_slab;
 static struct kmem_cache *dccp_ackvec_record_slab;
 
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
 {
-	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
+	struct dccp_ackvec_record *avr =
+			kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
 
-	if (av != NULL) {
-		av->av_buf_head	= av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
-		INIT_LIST_HEAD(&av->av_records);
-	}
-	return av;
+	if (avr != NULL)
+		INIT_LIST_HEAD(&avr->avr_node);
+
+	return avr;
 }
 
-static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
+static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
 {
-	struct dccp_ackvec_record *cur, *next;
-
-	list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
-		kmem_cache_free(dccp_ackvec_record_slab, cur);
-	INIT_LIST_HEAD(&av->av_records);
+	if (unlikely(avr == NULL))
+		return;
+	/* Check if deleting a linked record */
+	WARN_ON(!list_empty(&avr->avr_node));
+	kmem_cache_free(dccp_ackvec_record_slab, avr);
 }
 
-void dccp_ackvec_free(struct dccp_ackvec *av)
+static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
+				   struct dccp_ackvec_record *avr)
 {
-	if (likely(av != NULL)) {
-		dccp_ackvec_purge_records(av);
-		kmem_cache_free(dccp_ackvec_slab, av);
+	/*
+	 * AVRs are sorted by seqno. Since we are sending them in order, we
+	 * just add the AVR at the head of the list.
+	 * -sorbo.
+	 */
+	if (!list_empty(&av->av_records)) {
+		const struct dccp_ackvec_record *head =
+					list_entry(av->av_records.next,
+						   struct dccp_ackvec_record,
+						   avr_node);
+		BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
 	}
+
+	list_add(&avr->avr_node, &av->av_records);
 }
 
-/**
- * dccp_ackvec_update_records  -  Record information about sent Ack Vectors
- * @av:		Ack Vector records to update
- * @seqno:	Sequence number of the packet carrying the Ack Vector just sent
- * @nonce_sum:	The sum of all buffer nonces contained in the Ack Vector
- */
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
+int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	/* Figure out how many options do we need to represent the ackvec */
+	const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
+	u16 len = av->av_vec_len + 2 * nr_opts, i;
+	u32 elapsed_time;
+	const unsigned char *tail, *from;
+	unsigned char *to;
 	struct dccp_ackvec_record *avr;
+	suseconds_t delta;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+		return -1;
+
+	delta = ktime_us_delta(ktime_get_real(), av->av_time);
+	elapsed_time = delta / 10;
 
-	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+	if (elapsed_time != 0 &&
+	    dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
+		return -1;
+
+	avr = dccp_ackvec_record_new();
 	if (avr == NULL)
-		return -ENOBUFS;
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+	to   = skb_push(skb, len);
+	len  = av->av_vec_len;
+	from = av->av_buf + av->av_buf_head;
+	tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
+
+	for (i = 0; i < nr_opts; ++i) {
+		int copylen = len;
+
+		if (len > DCCP_MAX_ACKVEC_OPT_LEN)
+			copylen = DCCP_MAX_ACKVEC_OPT_LEN;
+
+		*to++ = DCCPO_ACK_VECTOR_0;
+		*to++ = copylen + 2;
+
+		/* Check if buf_head wraps */
+		if (from + copylen > tail) {
+			const u16 tailsize = tail - from;
+
+			memcpy(to, from, tailsize);
+			to	+= tailsize;
+			len	-= tailsize;
+			copylen	-= tailsize;
+			from	= av->av_buf;
+		}
+
+		memcpy(to, from, copylen);
+		from += copylen;
+		to   += copylen;
+		len  -= copylen;
+	}
 
-	avr->avr_ack_seqno  = seqno;
-	avr->avr_ack_ptr    = av->av_buf_head;
-	avr->avr_ack_ackno  = av->av_buf_ackno;
-	avr->avr_ack_nonce  = nonce_sum;
-	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
-	/*
-	 * When the buffer overflows, we keep no more than one record. This is
-	 * the simplest way of disambiguating sender-Acks dating from before the
-	 * overflow from sender-Acks which refer to after the overflow; a simple
-	 * solution is preferable here since we are handling an exception.
-	 */
-	if (av->av_overflow)
-		dccp_ackvec_purge_records(av);
 	/*
-	 * Since GSS is incremented for each packet, the list is automatically
-	 * arranged in descending order of @ack_seqno.
+	 *	From RFC 4340, A.2:
+	 *
+	 *	For each acknowledgement it sends, the HC-Receiver will add an
+	 *	acknowledgement record.  ack_seqno will equal the HC-Receiver
+	 *	sequence number it used for the ack packet; ack_ptr will equal
+	 *	buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
+	 *	equal buf_nonce.
 	 */
-	list_add(&avr->avr_node, &av->av_records);
+	avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	avr->avr_ack_ptr   = av->av_buf_head;
+	avr->avr_ack_ackno = av->av_buf_ackno;
+	avr->avr_ack_nonce = av->av_buf_nonce;
+	avr->avr_sent_len  = av->av_vec_len;
 
-	dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
+	dccp_ackvec_insert_avr(av, avr);
+
+	dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
+		      "ack_ackno=%llu\n",
+		      dccp_role(sk), avr->avr_sent_len,
 		      (unsigned long long)avr->avr_ack_seqno,
-		      (unsigned long long)avr->avr_ack_ackno,
-		      avr->avr_ack_runlen);
+		      (unsigned long long)avr->avr_ack_ackno);
 	return 0;
 }
 
-static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
-						     const u64 ackno)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
 {
-	struct dccp_ackvec_record *avr;
-	/*
-	 * Exploit that records are inserted in descending order of sequence
-	 * number, start with the oldest record first. If @ackno is `before'
-	 * the earliest ack_ackno, the packet is too old to be considered.
-	 */
-	list_for_each_entry_reverse(avr, av_list, avr_node) {
-		if (avr->avr_ack_seqno == ackno)
-			return avr;
-		if (before48(ackno, avr->avr_ack_seqno))
-			break;
+	struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
+
+	if (av != NULL) {
+		av->av_buf_head	 = DCCP_MAX_ACKVEC_LEN - 1;
+		av->av_buf_ackno = UINT48_MAX + 1;
+		av->av_buf_nonce = 0;
+		av->av_time	 = ktime_set(0, 0);
+		av->av_vec_len	 = 0;
+		INIT_LIST_HEAD(&av->av_records);
 	}
-	return NULL;
+
+	return av;
 }
 
-/*
- * Buffer index and length computation using modulo-buffersize arithmetic.
- * Note that, as pointers move from right to left, head is `before' tail.
- */
-static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
+void dccp_ackvec_free(struct dccp_ackvec *av)
 {
-	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
+	if (unlikely(av == NULL))
+		return;
+
+	if (!list_empty(&av->av_records)) {
+		struct dccp_ackvec_record *avr, *next;
+
+		list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
+			list_del_init(&avr->avr_node);
+			dccp_ackvec_record_delete(avr);
+		}
+	}
+
+	kmem_cache_free(dccp_ackvec_slab, av);
 }
 
-static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
+static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
+				   const u32 index)
 {
-	return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
+	return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
 }
 
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
+static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
+				 const u32 index)
 {
-	if (unlikely(av->av_overflow))
-		return DCCPAV_MAX_ACKVEC_LEN;
-	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
+	return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
 }
 
-/**
- * dccp_ackvec_update_old  -  Update previous state as per RFC 4340, 11.4.1
- * @av:		non-empty buffer to update
- * @distance:   negative or zero distance of @seqno from buf_ackno downward
- * @seqno:	the (old) sequence number whose record is to be updated
- * @state:	state in which packet carrying @seqno was received
+/*
+ * If several packets are missing, the HC-Receiver may prefer to enter multiple
+ * bytes with run length 0, rather than a single byte with a larger run length;
+ * this simplifies table updates if one of the missing packets arrives.
  */
-static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
-				   u64 seqno, enum dccp_ackvec_states state)
+static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
+						 const unsigned int packets,
+						 const unsigned char state)
 {
-	u16 ptr = av->av_buf_head;
+	unsigned int gap;
+	long new_head;
 
-	BUG_ON(distance > 0);
-	if (unlikely(dccp_ackvec_is_empty(av)))
-		return;
+	if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
+		return -ENOBUFS;
 
-	do {
-		u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
+	gap	 = packets - 1;
+	new_head = av->av_buf_head - packets;
 
-		if (distance + runlen >= 0) {
-			/*
-			 * Only update the state if packet has not been received
-			 * yet. This is OK as per the second table in RFC 4340,
-			 * 11.4.1; i.e. here we are using the following table:
-			 *                     RECEIVED
-			 *                      0   1   3
-			 *              S     +---+---+---+
-			 *              T   0 | 0 | 0 | 0 |
-			 *              O     +---+---+---+
-			 *              R   1 | 1 | 1 | 1 |
-			 *              E     +---+---+---+
-			 *              D   3 | 0 | 1 | 3 |
-			 *                    +---+---+---+
-			 * The "Not Received" state was set by reserve_seats().
-			 */
-			if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
-				av->av_buf[ptr] = state;
-			else
-				dccp_pr_debug("Not changing %llu state to %u\n",
-					      (unsigned long long)seqno, state);
-			break;
+	if (new_head < 0) {
+		if (gap > 0) {
+			memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
+			       gap + new_head + 1);
+			gap = -new_head;
 		}
+		new_head += DCCP_MAX_ACKVEC_LEN;
+	}
 
-		distance += runlen + 1;
-		ptr	  = __ackvec_idx_add(ptr, 1);
+	av->av_buf_head = new_head;
 
-	} while (ptr != av->av_buf_tail);
-}
+	if (gap > 0)
+		memset(av->av_buf + av->av_buf_head + 1,
+		       DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
 
-/* Mark @num entries after buf_head as "Not yet received". */
-static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
-{
-	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
-	    len	  = DCCPAV_MAX_ACKVEC_LEN - start;
-
-	/* check for buffer wrap-around */
-	if (num > len) {
-		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
-		start = 0;
-		num  -= len;
-	}
-	if (num)
-		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
+	av->av_buf[av->av_buf_head] = state;
+	av->av_vec_len += packets;
+	return 0;
 }
 
-/**
- * dccp_ackvec_add_new  -  Record one or more new entries in Ack Vector buffer
- * @av:		 container of buffer to update (can be empty or non-empty)
- * @num_packets: number of packets to register (must be >= 1)
- * @seqno:	 sequence number of the first packet in @num_packets
- * @state:	 state in which packet carrying @seqno was received
+/*
+ * Implements the RFC 4340, Appendix A
  */
-static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
-				u64 seqno, enum dccp_ackvec_states state)
+int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
+		    const u64 ackno, const u8 state)
 {
-	u32 num_cells = num_packets;
+	/*
+	 * Check at the right places if the buffer is full, if it is, tell the
+	 * caller to start dropping packets till the HC-Sender acks our ACK
+	 * vectors, when we will free up space in av_buf.
+	 *
+	 * We may well decide to do buffer compression, etc, but for now lets
+	 * just drop.
+	 *
+	 * From Appendix A.1.1 (`New Packets'):
+	 *
+	 *	Of course, the circular buffer may overflow, either when the
+	 *	HC-Sender is sending data at a very high rate, when the
+	 *	HC-Receiver's acknowledgements are not reaching the HC-Sender,
+	 *	or when the HC-Sender is forgetting to acknowledge those acks
+	 *	(so the HC-Receiver is unable to clean up old state). In this
+	 *	case, the HC-Receiver should either compress the buffer (by
+	 *	increasing run lengths when possible), transfer its state to
+	 *	a larger buffer, or, as a last resort, drop all received
+	 *	packets, without processing them whatsoever, until its buffer
+	 *	shrinks again.
+	 */
 
-	if (num_packets > DCCPAV_BURST_THRESH) {
-		u32 lost_packets = num_packets - 1;
+	/* See if this is the first ackno being inserted */
+	if (av->av_vec_len == 0) {
+		av->av_buf[av->av_buf_head] = state;
+		av->av_vec_len = 1;
+	} else if (after48(ackno, av->av_buf_ackno)) {
+		const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
 
-		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
 		/*
-		 * We received 1 packet and have a loss of size "num_packets-1"
-		 * which we squeeze into num_cells-1 rather than reserving an
-		 * entire byte for each lost packet.
-		 * The reason is that the vector grows in O(burst_length); when
-		 * it grows too large there will no room left for the payload.
-		 * This is a trade-off: if a few packets out of the burst show
-		 * up later, their state will not be changed; it is simply too
-		 * costly to reshuffle/reallocate/copy the buffer each time.
-		 * Should such problems persist, we will need to switch to a
-		 * different underlying data structure.
+		 * Look if the state of this packet is the same as the
+		 * previous ackno and if so if we can bump the head len.
 		 */
-		for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
-			u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
-
-			av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
-			av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+		if (delta == 1 &&
+		    dccp_ackvec_state(av, av->av_buf_head) == state &&
+		    dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
+			av->av_buf[av->av_buf_head]++;
+		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
+			return -ENOBUFS;
+	} else {
+		/*
+		 * A.1.2.  Old Packets
+		 *
+		 *	When a packet with Sequence Number S <= buf_ackno
+		 *	arrives, the HC-Receiver will scan the table for
+		 *	the byte corresponding to S. (Indexing structures
+		 *	could reduce the complexity of this scan.)
+		 */
+		u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
+		u32 index = av->av_buf_head;
 
-			lost_packets -= len;
+		while (1) {
+			const u8 len = dccp_ackvec_len(av, index);
+			const u8 av_state = dccp_ackvec_state(av, index);
+			/*
+			 * valid packets not yet in av_buf have a reserved
+			 * entry, with a len equal to 0.
+			 */
+			if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+			    len == 0 && delta == 0) { /* Found our
+							 reserved seat! */
+				dccp_pr_debug("Found %llu reserved seat!\n",
+					      (unsigned long long)ackno);
+				av->av_buf[index] = state;
+				goto out;
+			}
+			/* len == 0 means one packet */
+			if (delta < len + 1)
+				goto out_duplicate;
+
+			delta -= len + 1;
+			if (++index == DCCP_MAX_ACKVEC_LEN)
+				index = 0;
 		}
 	}
 
-	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
-		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
-		av->av_overflow = true;
-	}
-
-	av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
-	if (av->av_overflow)
-		av->av_buf_tail = av->av_buf_head;
-
-	av->av_buf[av->av_buf_head] = state;
-	av->av_buf_ackno	    = seqno;
+	av->av_buf_ackno = ackno;
+	av->av_time = ktime_get_real();
+out:
+	return 0;
 
-	if (num_packets > 1)
-		dccp_ackvec_reserve_seats(av, num_packets - 1);
+out_duplicate:
+	/* Duplicate packet */
+	dccp_pr_debug("Received a dup or already considered lost "
+		      "packet: %llu\n", (unsigned long long)ackno);
+	return -EILSEQ;
 }
 
-/**
- * dccp_ackvec_input  -  Register incoming packet in the buffer
- */
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
+static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
+				     struct dccp_ackvec_record *avr)
 {
-	u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-	enum dccp_ackvec_states state = DCCPAV_RECEIVED;
+	struct dccp_ackvec_record *next;
 
-	if (dccp_ackvec_is_empty(av)) {
-		dccp_ackvec_add_new(av, 1, seqno, state);
-		av->av_tail_ackno = seqno;
+	/* sort out vector length */
+	if (av->av_buf_head <= avr->avr_ack_ptr)
+		av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
+	else
+		av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
+				 av->av_buf_head + avr->avr_ack_ptr;
 
-	} else {
-		s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
-		u8 *current_head = av->av_buf + av->av_buf_head;
-
-		if (num_packets == 1 &&
-		    dccp_ackvec_state(current_head) == state &&
-		    dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
+	/* free records */
+	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+		list_del_init(&avr->avr_node);
+		dccp_ackvec_record_delete(avr);
+	}
+}
 
-			*current_head   += 1;
-			av->av_buf_ackno = seqno;
+void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
+				 const u64 ackno)
+{
+	struct dccp_ackvec_record *avr;
 
-		} else if (num_packets > 0) {
-			dccp_ackvec_add_new(av, num_packets, seqno, state);
-		} else {
-			dccp_ackvec_update_old(av, num_packets, seqno, state);
-		}
+	/*
+	 * If we traverse backwards, it should be faster when we have large
+	 * windows. We will be receiving ACKs for stuff we sent a while back
+	 * -sorbo.
+	 */
+	list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
+		if (ackno == avr->avr_ack_seqno) {
+			dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
+				      "ack_ackno=%llu, ACKED!\n",
+				      dccp_role(sk), 1,
+				      (unsigned long long)avr->avr_ack_seqno,
+				      (unsigned long long)avr->avr_ack_ackno);
+			dccp_ackvec_throw_record(av, avr);
+			break;
+		} else if (avr->avr_ack_seqno > ackno)
+			break; /* old news */
 	}
 }
 
-/**
- * dccp_ackvec_clear_state  -  Perform house-keeping / garbage-collection
- * This routine is called when the peer acknowledges the receipt of Ack Vectors
- * up to and including @ackno. While based on on section A.3 of RFC 4340, here
- * are additional precautions to prevent corrupted buffer state. In particular,
- * we use tail_ackno to identify outdated records; it always marks the earliest
- * packet of group (2) in 11.4.2.
- */
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
- {
-	struct dccp_ackvec_record *avr, *next;
-	u8 runlen_now, eff_runlen;
-	s64 delta;
+static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
+					    struct sock *sk, u64 *ackno,
+					    const unsigned char len,
+					    const unsigned char *vector)
+{
+	unsigned char i;
+	struct dccp_ackvec_record *avr;
 
-	avr = dccp_ackvec_lookup(&av->av_records, ackno);
-	if (avr == NULL)
+	/* Check if we actually sent an ACK vector */
+	if (list_empty(&av->av_records))
 		return;
-	/*
-	 * Deal with outdated acknowledgments: this arises when e.g. there are
-	 * several old records and the acks from the peer come in slowly. In
-	 * that case we may still have records that pre-date tail_ackno.
-	 */
-	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
-	if (delta < 0)
-		goto free_records;
-	/*
-	 * Deal with overlapping Ack Vectors: don't subtract more than the
-	 * number of packets between tail_ackno and ack_ackno.
-	 */
-	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
 
-	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
+	i = len;
 	/*
-	 * The run length of Ack Vector cells does not decrease over time. If
-	 * the run length is the same as at the time the Ack Vector was sent, we
-	 * free the ack_ptr cell. That cell can however not be freed if the run
-	 * length has increased: in this case we need to move the tail pointer
-	 * backwards (towards higher indices), to its next-oldest neighbour.
+	 * XXX
+	 * I think it might be more efficient to work backwards. See comment on
+	 * rcv_ackno. -sorbo.
 	 */
-	if (runlen_now > eff_runlen) {
+	avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
+	while (i--) {
+		const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+		u64 ackno_end_rl;
 
-		av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
-		av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+		dccp_set_seqno(&ackno_end_rl, *ackno - rl);
 
-		/* This move may not have cleared the overflow flag. */
-		if (av->av_overflow)
-			av->av_overflow = (av->av_buf_head == av->av_buf_tail);
-	} else {
-		av->av_buf_tail	= avr->avr_ack_ptr;
 		/*
-		 * We have made sure that avr points to a valid cell within the
-		 * buffer. This cell is either older than head, or equals head
-		 * (empty buffer): in both cases we no longer have any overflow.
+		 * If our AVR sequence number is greater than the ack, go
+		 * forward in the AVR list until it is not so.
 		 */
-		av->av_overflow	= 0;
-	}
-
-	/*
-	 * The peer has acknowledged up to and including ack_ackno. Hence the
-	 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
-	 */
-	av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+		list_for_each_entry_from(avr, &av->av_records, avr_node) {
+			if (!after48(avr->avr_ack_seqno, *ackno))
+				goto found;
+		}
+		/* End of the av_records list, not found, exit */
+		break;
+found:
+		if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
+			const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
+			if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
+				dccp_pr_debug("%s ACK vector 0, len=%d, "
+					      "ack_seqno=%llu, ack_ackno=%llu, "
+					      "ACKED!\n",
+					      dccp_role(sk), len,
+					      (unsigned long long)
+					      avr->avr_ack_seqno,
+					      (unsigned long long)
+					      avr->avr_ack_ackno);
+				dccp_ackvec_throw_record(av, avr);
+				break;
+			}
+			/*
+			 * If it wasn't received, continue scanning... we might
+			 * find another one.
+			 */
+		}
 
-free_records:
-	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del(&avr->avr_node);
-		kmem_cache_free(dccp_ackvec_record_slab, avr);
+		dccp_set_seqno(ackno, ackno_end_rl - 1);
+		++vector;
 	}
 }
 
-/*
- *	Routines to keep track of Ack Vectors received in an skb
- */
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
+int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
+		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
 {
-	struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
-
-	if (new == NULL)
-		return -ENOBUFS;
-	new->vec   = vec;
-	new->len   = len;
-	new->nonce = nonce;
+	if (len > DCCP_MAX_ACKVEC_OPT_LEN)
+		return -1;
 
-	list_add_tail(&new->node, head);
+	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
+	dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
+					ackno, len, value);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
-
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
-{
-	struct dccp_ackvec_parsed *cur, *next;
-
-	list_for_each_entry_safe(cur, next, parsed_chunks, node)
-		kfree(cur);
-	INIT_LIST_HEAD(parsed_chunks);
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
 
 int __init dccp_ackvec_init(void)
 {
@@ -379,9 +449,10 @@ int __init dccp_ackvec_init(void)
 	if (dccp_ackvec_slab == NULL)
 		goto out_err;
 
-	dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
-					     sizeof(struct dccp_ackvec_record),
-					     0, SLAB_HWCACHE_ALIGN, NULL);
+	dccp_ackvec_record_slab =
+			kmem_cache_create("dccp_ackvec_record",
+					  sizeof(struct dccp_ackvec_record),
+					  0, SLAB_HWCACHE_ALIGN, NULL);
 	if (dccp_ackvec_record_slab == NULL)
 		goto out_destroy_slab;
 
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 6cdca79a99f..bcb64fb4ace 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,134 +3,156 @@
 /*
  *  net/dccp/ackvec.h
  *
- *  An implementation of Ack Vectors for the DCCP protocol
- *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
+ *  An implementation of the DCCP protocol
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
  *	This program is free software; you can redistribute it and/or modify it
  *	under the terms of the GNU General Public License version 2 as
  *	published by the Free Software Foundation.
  */
 
-#include <linux/dccp.h>
 #include <linux/compiler.h>
+#include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/types.h>
 
-/*
- * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
- * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
- * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
- * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
- * The maximum value is bounded by the u16 types for indices and functions.
- */
-#define DCCPAV_NUM_ACKVECS	2
-#define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
-
-/* Estimated minimum average Ack Vector length - used for updating MPS */
-#define DCCPAV_MIN_OPTLEN	16
-
-/* Threshold for coping with large bursts of losses */
-#define DCCPAV_BURST_THRESH	(DCCPAV_MAX_ACKVEC_LEN / 8)
-
-enum dccp_ackvec_states {
-	DCCPAV_RECEIVED =	0x00,
-	DCCPAV_ECN_MARKED =	0x40,
-	DCCPAV_RESERVED =	0x80,
-	DCCPAV_NOT_RECEIVED =	0xC0
-};
-#define DCCPAV_MAX_RUNLEN	0x3F
+/* Read about the ECN nonce to see why it is 253 */
+#define DCCP_MAX_ACKVEC_OPT_LEN 253
+/* We can spread an ack vector across multiple options */
+#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
 
-static inline u8 dccp_ackvec_runlen(const u8 *cell)
-{
-	return *cell & DCCPAV_MAX_RUNLEN;
-}
+#define DCCP_ACKVEC_STATE_RECEIVED	0
+#define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
+#define DCCP_ACKVEC_STATE_NOT_RECEIVED	(3 << 6)
 
-static inline u8 dccp_ackvec_state(const u8 *cell)
-{
-	return *cell & ~DCCPAV_MAX_RUNLEN;
-}
+#define DCCP_ACKVEC_STATE_MASK		0xC0 /* 11000000 */
+#define DCCP_ACKVEC_LEN_MASK		0x3F /* 00111111 */
 
-/** struct dccp_ackvec - Ack Vector main data structure
+/** struct dccp_ackvec - ack vector
+ *
+ * This data structure is the one defined in RFC 4340, Appendix A.
  *
- * This implements a fixed-size circular buffer within an array and is largely
- * based on Appendix A of RFC 4340.
+ * @av_buf_head - circular buffer head
+ * @av_buf_tail - circular buffer tail
+ * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
+ *		       buffer (i.e. %av_buf_head)
+ * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
+ * 		       by the buffer with State 0
  *
- * @av_buf:	   circular buffer storage area
- * @av_buf_head:   head index; begin of live portion in @av_buf
- * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
- * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
- * @av_tail_ackno: lowest  seqno of acknowledgeable packet recorded in @av_buf
- * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
- *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
- * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
- * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
+ * Additionally, the HC-Receiver must keep some information about the
+ * Ack Vectors it has recently sent. For each packet sent carrying an
+ * Ack Vector, it remembers four variables:
+ *
+ * @av_records - list of dccp_ackvec_record
+ * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ *
+ * @av_time - the time in usecs
+ * @av_buf - circular buffer of acknowledgeable packets
  */
 struct dccp_ackvec {
-	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
-	u16			av_buf_head;
-	u16			av_buf_tail;
-	u64			av_buf_ackno:48;
-	u64			av_tail_ackno:48;
-	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
-	u8			av_overflow:1;
+	u64			av_buf_ackno;
 	struct list_head	av_records;
+	ktime_t			av_time;
+	u16			av_buf_head;
+	u16			av_vec_len;
+	u8			av_buf_nonce;
+	u8			av_ack_nonce;
+	u8			av_buf[DCCP_MAX_ACKVEC_LEN];
 };
 
-/** struct dccp_ackvec_record - Records information about sent Ack Vectors
+/** struct dccp_ackvec_record - ack vector record
  *
- * These list entries define the additional information which the HC-Receiver
- * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
+ * ACK vector record as defined in Appendix A of spec.
  *
- * @avr_node:	    the list node in @av_records
- * @avr_ack_seqno:  sequence number of the packet the Ack Vector was sent on
- * @avr_ack_ackno:  the Ack number that this record/Ack Vector refers to
- * @avr_ack_ptr:    pointer into @av_buf where this record starts
- * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
- * @avr_ack_nonce:  the sum of @av_buf_nonce's at the time this record was sent
+ * The list is sorted by avr_ack_seqno
  *
- * The list as a whole is sorted in descending order by @avr_ack_seqno.
+ * @avr_node - node in av_records
+ * @avr_ack_seqno - sequence number of the packet this record was sent on
+ * @avr_ack_ackno - sequence number being acknowledged
+ * @avr_ack_ptr - pointer into av_buf where this record starts
+ * @avr_ack_nonce - av_ack_nonce at the time this record was sent
+ * @avr_sent_len - lenght of the record in av_buf
  */
 struct dccp_ackvec_record {
 	struct list_head avr_node;
-	u64		 avr_ack_seqno:48;
-	u64		 avr_ack_ackno:48;
+	u64		 avr_ack_seqno;
+	u64		 avr_ack_ackno;
 	u16		 avr_ack_ptr;
-	u8		 avr_ack_runlen;
-	u8		 avr_ack_nonce:1;
+	u16		 avr_sent_len;
+	u8		 avr_ack_nonce;
 };
 
-extern int  dccp_ackvec_init(void);
+struct sock;
+struct sk_buff;
+
+#ifdef CONFIG_IP_DCCP_ACKVEC
+extern int dccp_ackvec_init(void);
 extern void dccp_ackvec_exit(void);
 
 extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
 extern void dccp_ackvec_free(struct dccp_ackvec *av);
 
-extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
-extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
-extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
-extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
+extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
+			   const u64 ackno, const u8 state);
+
+extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
+					struct sock *sk, const u64 ackno);
+extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
+			     u64 *ackno, const u8 opt,
+			     const u8 *value, const u8 len);
 
-static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
+extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
+
+static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+{
+	return av->av_vec_len;
+}
+#else /* CONFIG_IP_DCCP_ACKVEC */
+static inline int dccp_ackvec_init(void)
 {
-	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
+	return 0;
 }
 
-/**
- * struct dccp_ackvec_parsed  -  Record offsets of Ack Vectors in skb
- * @vec:	start of vector (offset into skb)
- * @len:	length of @vec
- * @nonce:	whether @vec had an ECN nonce of 0 or 1
- * @node:	FIFO - arranged in descending order of ack_ackno
- * This structure is used by CCIDs to access Ack Vectors in a received skb.
- */
-struct dccp_ackvec_parsed {
-	u8		 *vec,
-			 len,
-			 nonce:1;
-	struct list_head node;
-};
+static inline void dccp_ackvec_exit(void)
+{
+}
+
+static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+{
+	return NULL;
+}
+
+static inline void dccp_ackvec_free(struct dccp_ackvec *av)
+{
+}
+
+static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
+				  const u64 ackno, const u8 state)
+{
+	return -1;
+}
 
-extern int dccp_ackvec_parsed_add(struct list_head *head,
-				  u8 *vec, u8 len, u8 nonce);
-extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
+static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
+					       struct sock *sk, const u64 ackno)
+{
+}
+
+static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
+				    const u64 *ackno, const u8 opt,
+				    const u8 *value, const u8 len)
+{
+	return -1;
+}
+
+static inline int dccp_insert_option_ackvec(const struct sock *sk,
+					    const struct sk_buff *skb)
+{
+	return -1;
+}
+
+static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+{
+	return 0;
+}
+#endif /* CONFIG_IP_DCCP_ACKVEC */
 #endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index e3fb52b4f5c..4809753d12a 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -13,13 +13,6 @@
 
 #include "ccid.h"
 
-static u8 builtin_ccids[] = {
-	DCCPC_CCID2,		/* CCID2 is supported by default */
-#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
-	DCCPC_CCID3,
-#endif
-};
-
 static struct ccid_operations *ccids[CCID_MAX];
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 static atomic_t ccids_lockct = ATOMIC_INIT(0);
@@ -93,47 +86,6 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
 	}
 }
 
-/* check that up to @array_len members in @ccid_array are supported */
-bool ccid_support_check(u8 const *ccid_array, u8 array_len)
-{
-	u8 i, j, found;
-
-	for (i = 0, found = 0; i < array_len; i++, found = 0) {
-		for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++)
-			found = (ccid_array[i] == builtin_ccids[j]);
-		if (!found)
-			return false;
-	}
-	return true;
-}
-
-/**
- * ccid_get_builtin_ccids  -  Provide copy of `builtin' CCID array
- * @ccid_array: pointer to copy into
- * @array_len: value to return length into
- * This function allocates memory - caller must see that it is freed after use.
- */
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
-{
-	*ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any());
-	if (*ccid_array == NULL)
-		return -ENOBUFS;
-	*array_len = ARRAY_SIZE(builtin_ccids);
-	return 0;
-}
-
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
-				    char __user *optval, int __user *optlen)
-{
-	if (len < sizeof(builtin_ccids))
-		return -EINVAL;
-
-	if (put_user(sizeof(builtin_ccids), optlen) ||
-	    copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids)))
-		return -EFAULT;
-	return 0;
-}
-
 int ccid_register(struct ccid_operations *ccid_ops)
 {
 	int err = -ENOBUFS;
@@ -196,41 +148,22 @@ int ccid_unregister(struct ccid_operations *ccid_ops)
 
 EXPORT_SYMBOL_GPL(ccid_unregister);
 
-/**
- * ccid_request_module  -  Pre-load CCID module for later use
- * This should be called only from process context (e.g. during connection
- * setup) and is necessary for later calls to ccid_new (typically in software
- * interrupt), so that it has the modules available when they are needed.
- */
-static int ccid_request_module(u8 id)
-{
-	if (!in_atomic()) {
-		ccids_read_lock();
-		if (ccids[id] == NULL) {
-			ccids_read_unlock();
-			return request_module("net-dccp-ccid-%d", id);
-		}
-		ccids_read_unlock();
-	}
-	return 0;
-}
-
-int ccid_request_modules(u8 const *ccid_array, u8 array_len)
-{
-#ifdef CONFIG_KMOD
-	while (array_len--)
-		if (ccid_request_module(ccid_array[array_len]))
-			return -1;
-#endif
-	return 0;
-}
-
 struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
 {
 	struct ccid_operations *ccid_ops;
 	struct ccid *ccid = NULL;
 
 	ccids_read_lock();
+#ifdef CONFIG_KMOD
+	if (ccids[id] == NULL) {
+		/* We only try to load if in process context */
+		ccids_read_unlock();
+		if (gfp & GFP_ATOMIC)
+			goto out;
+		request_module("net-dccp-ccid-%d", id);
+		ccids_read_lock();
+	}
+#endif
 	ccid_ops = ccids[id];
 	if (ccid_ops == NULL)
 		goto out_unlock;
@@ -272,6 +205,20 @@ out_module_put:
 
 EXPORT_SYMBOL_GPL(ccid_new);
 
+struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
+{
+	return ccid_new(id, sk, 1, gfp);
+}
+
+EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
+
+struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk,  gfp_t gfp)
+{
+	return ccid_new(id, sk, 0, gfp);
+}
+
+EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
+
 static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
 {
 	struct ccid_operations *ccid_ops;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index d27054ba215..fdeae7b5731 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -60,18 +60,22 @@ struct ccid_operations {
 	void		(*ccid_hc_tx_exit)(struct sock *sk);
 	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
-						    u8 opt, u8 *val, u8 len);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk,
+						    unsigned char option,
+						    unsigned char len, u16 idx,
+						    unsigned char* value);
 	int		(*ccid_hc_rx_insert_options)(struct sock *sk,
 						     struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
-						    u8 opt, u8 *val, u8 len);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk,
+						    unsigned char option,
+						    unsigned char len, u16 idx,
+						    unsigned char* value);
 	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
 						  struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
-						  unsigned int len);
+						  int more, unsigned int len);
 	void		(*ccid_hc_rx_get_info)(struct sock *sk,
 					       struct tcp_info *info);
 	void		(*ccid_hc_tx_get_info)(struct sock *sk,
@@ -99,78 +103,31 @@ static inline void *ccid_priv(const struct ccid *ccid)
 	return (void *)ccid->ccid_priv;
 }
 
-extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
-extern int  ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
-extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
-					  char __user *, int __user *);
-
-extern int    ccid_request_modules(u8 const *ccid_array, u8 array_len);
 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
 			     gfp_t gfp);
 
-static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
-{
-	struct ccid *ccid = dp->dccps_hc_rx_ccid;
-
-	if (ccid == NULL || ccid->ccid_ops == NULL)
-		return -1;
-	return ccid->ccid_ops->ccid_id;
-}
-
-static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
-{
-	struct ccid *ccid = dp->dccps_hc_tx_ccid;
-
-	if (ccid == NULL || ccid->ccid_ops == NULL)
-		return -1;
-	return ccid->ccid_ops->ccid_id;
-}
+extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
+				   gfp_t gfp);
+extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
+				   gfp_t gfp);
 
 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
 
-/*
- * Congestion control of queued data packets via CCID decision.
- *
- * The TX CCID performs its congestion-control by indicating whether and when a
- * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
- * The following modes are supported via the symbolic constants below:
- * - timer-based pacing    (CCID returns a delay value in milliseconds);
- * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
- */
-
-enum ccid_dequeueing_decision {
-	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
-	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs  */
-	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode */
-	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode */
-	CCID_PACKET_ERR =		 0xF0000,  /* error condition */
-};
-
-static inline int ccid_packet_dequeue_eval(const int return_code)
-{
-	if (return_code < 0)
-		return CCID_PACKET_ERR;
-	if (return_code == 0)
-		return CCID_PACKET_SEND_AT_ONCE;
-	if (return_code <= CCID_PACKET_DELAY_MAX)
-		return CCID_PACKET_DELAY;
-	return return_code;
-}
-
 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
 					 struct sk_buff *skb)
 {
+	int rc = 0;
 	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
-		return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
-	return CCID_PACKET_SEND_AT_ONCE;
+		rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+	return rc;
 }
 
 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
-					  unsigned int len)
+					  int more, unsigned int len)
 {
 	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
-		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
+		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
 }
 
 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -187,31 +144,27 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
 		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
 }
 
-/**
- * ccid_hc_tx_parse_options  -  Parse CCID-specific options sent by the receiver
- * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
- * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
- * @val: value of @opt
- * @len: length of @val in bytes
- */
 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
-					   u8 pkt, u8 opt, u8 *val, u8 len)
+					   unsigned char option,
+					   unsigned char len, u16 idx,
+					   unsigned char* value)
 {
-	if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
-		return 0;
-	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
+	int rc = 0;
+	if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
+		rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
+						    value);
+	return rc;
 }
 
-/**
- * ccid_hc_rx_parse_options  -  Parse CCID-specific options sent by the sender
- * Arguments are analogous to ccid_hc_tx_parse_options()
- */
 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
-					   u8 pkt, u8 opt, u8 *val, u8 len)
+					   unsigned char option,
+					   unsigned char len, u16 idx,
+					   unsigned char* value)
 {
-	if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
-		return 0;
-	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
+	int rc = 0;
+	if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
+		rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
+	return rc;
 }
 
 static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index fb168be2cb4..12275943eab 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,8 +1,10 @@
 menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
 
 config IP_DCCP_CCID2
-	tristate "CCID2 (TCP-Like)"
+	tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
 	def_tristate IP_DCCP
+	select IP_DCCP_ACKVEC
 	---help---
 	  CCID 2, TCP-like Congestion Control, denotes Additive Increase,
 	  Multiplicative Decrease (AIMD) congestion control with behavior
@@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG
 	    If in doubt, say N.
 
 config IP_DCCP_CCID3
-	tristate "CCID3 (TCP-Friendly)"
+	tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
 	def_tristate IP_DCCP
 	select IP_DCCP_TFRC_LIB
 	---help---
@@ -62,9 +64,9 @@ config IP_DCCP_CCID3
 
 	  If in doubt, say M.
 
-if IP_DCCP_CCID3
 config IP_DCCP_CCID3_DEBUG
 	  bool "CCID3 debugging messages"
+	  depends on IP_DCCP_CCID3
 	  ---help---
 	    Enable CCID3-specific debugging messages.
 
@@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG
 
 	    If in doubt, say N.
 
-choice
-	  prompt "Select method for measuring the packet size s"
-	  default IP_DCCP_CCID3_MEASURE_S_AS_MPS
-
-config IP_DCCP_CCID3_MEASURE_S_AS_MPS
-	  bool "Always use MPS in place of s"
-	  ---help---
-	    This use is recommended as it is consistent with the initialisation
-	    of X and suggested when s varies (rfc3448bis, (1) in section 4.1).
-config IP_DCCP_CCID3_MEASURE_S_AS_AVG
-	  bool "Use moving average"
-	  ---help---
-	    An alternative way of tracking s, also supported by rfc3448bis.
-	    This used to be the default for CCID-3 in previous kernels.
-config IP_DCCP_CCID3_MEASURE_S_AS_MAX
-	  bool "Track the maximum payload length"
-	  ---help---
-	    An experimental method based on tracking the maximum packet size.
-endchoice
-
 config IP_DCCP_CCID3_RTO
 	  int "Use higher bound for nofeedback timer"
 	  default 100
+	  depends on IP_DCCP_CCID3 && EXPERIMENTAL
 	  ---help---
 	    Use higher lower bound for nofeedback timer expiration.
 
@@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO
 	    The purpose of the nofeedback timer is to slow DCCP down when there
 	    is serious network congestion: experimenting with larger values should
 	    therefore not be performed on WANs.
-endif	# IP_DCCP_CCID3
 
 config IP_DCCP_TFRC_LIB
 	tristate
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index fa713227c66..9a430734530 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,7 +25,7 @@
 /*
  * This implementation should follow RFC 4341
  */
-#include "../feat.h"
+
 #include "../ccid.h"
 #include "../dccp.h"
 #include "ccid2.h"
@@ -34,8 +34,51 @@
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 static int ccid2_debug;
 #define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
+
+static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
+{
+	int len = 0;
+	int pipe = 0;
+	struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
+
+	/* there is data in the chain */
+	if (seqp != hctx->ccid2hctx_seqt) {
+		seqp = seqp->ccid2s_prev;
+		len++;
+		if (!seqp->ccid2s_acked)
+			pipe++;
+
+		while (seqp != hctx->ccid2hctx_seqt) {
+			struct ccid2_seq *prev = seqp->ccid2s_prev;
+
+			len++;
+			if (!prev->ccid2s_acked)
+				pipe++;
+
+			/* packets are sent sequentially */
+			BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
+						prev->ccid2s_seq ) >= 0);
+			BUG_ON(time_before(seqp->ccid2s_sent,
+					   prev->ccid2s_sent));
+
+			seqp = prev;
+		}
+	}
+
+	BUG_ON(pipe != hctx->ccid2hctx_pipe);
+	ccid2_pr_debug("len of chain=%d\n", len);
+
+	do {
+		seqp = seqp->ccid2s_prev;
+		len++;
+	} while (seqp != hctx->ccid2hctx_seqh);
+
+	ccid2_pr_debug("total len=%d\n", len);
+	BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
+}
 #else
 #define ccid2_pr_debug(format, a...)
+#define ccid2_hc_tx_check_sanity(hctx)
 #endif
 
 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	int i;
 
 	/* check if we have space to preserve the pointer to the buffer */
-	if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *))
+	if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
+					sizeof(struct ccid2_seq*)))
 		return -ENOMEM;
 
 	/* allocate buffer and initialize linked list */
@@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
 
 	/* This is the first allocation.  Initiate the head and tail.  */
-	if (hctx->seqbufc == 0)
-		hctx->seqh = hctx->seqt = seqp;
+	if (hctx->ccid2hctx_seqbufc == 0)
+		hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
 	else {
 		/* link the existing list with the one we just created */
-		hctx->seqh->ccid2s_next = seqp;
-		seqp->ccid2s_prev = hctx->seqh;
+		hctx->ccid2hctx_seqh->ccid2s_next = seqp;
+		seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
 
-		hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt;
+		hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt;
 	}
 
 	/* store the original pointer to the buffer so we can free it */
-	hctx->seqbuf[hctx->seqbufc] = seqp;
-	hctx->seqbufc++;
+	hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
+	hctx->ccid2hctx_seqbufc++;
 
 	return 0;
 }
 
 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
-	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
-		return CCID_PACKET_WILL_DEQUEUE_LATER;
-	return CCID_PACKET_SEND_AT_ONCE;
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
+		return 0;
+
+	return 1; /* XXX CCID should dequeue when ready instead of polling */
 }
 
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2);
+	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
 
 	/*
 	 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
@@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
 		val = max_ratio;
 	}
-	if (val > DCCPF_ACK_RATIO_MAX)
-		val = DCCPF_ACK_RATIO_MAX;
+	if (val > 0xFFFF)		/* RFC 4340, 11.3 */
+		val = 0xFFFF;
 
 	if (val == dp->dccps_l_ack_ratio)
 		return;
@@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
+static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
+{
+	ccid2_pr_debug("change SRTT to %ld\n", val);
+	hctx->ccid2hctx_srtt = val;
+}
+
+static void ccid2_start_rto_timer(struct sock *sk);
+
 static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
+	long s;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
-		sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5);
+		sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
+			       jiffies + HZ / 5);
 		goto out;
 	}
 
 	ccid2_pr_debug("RTO_EXPIRE\n");
 
+	ccid2_hc_tx_check_sanity(hctx);
+
 	/* back-off timer */
-	hctx->rto <<= 1;
-	if (hctx->rto > DCCP_RTO_MAX)
-		hctx->rto = DCCP_RTO_MAX;
+	hctx->ccid2hctx_rto <<= 1;
+
+	s = hctx->ccid2hctx_rto / HZ;
+	if (s > 60)
+		hctx->ccid2hctx_rto = 60 * HZ;
+
+	ccid2_start_rto_timer(sk);
 
 	/* adjust pipe, cwnd etc */
-	hctx->ssthresh = hctx->cwnd / 2;
-	if (hctx->ssthresh < 2)
-		hctx->ssthresh = 2;
-	hctx->cwnd = 1;
-	hctx->pipe = 0;
+	hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
+	if (hctx->ccid2hctx_ssthresh < 2)
+		hctx->ccid2hctx_ssthresh = 2;
+	hctx->ccid2hctx_cwnd	 = 1;
+	hctx->ccid2hctx_pipe	 = 0;
 
 	/* clear state about stuff we sent */
-	hctx->seqt = hctx->seqh;
-	hctx->packets_acked = 0;
+	hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
+	hctx->ccid2hctx_packets_acked = 0;
 
 	/* clear ack ratio state. */
-	hctx->rpseq    = 0;
-	hctx->rpdupack = -1;
+	hctx->ccid2hctx_rpseq	 = 0;
+	hctx->ccid2hctx_rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
-
-	/* if we were blocked before, we may now send cwnd=1 packet */
-	if (sender_was_blocked)
-		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
-	/* restart backed-off timer */
-	sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
+	ccid2_hc_tx_check_sanity(hctx);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+static void ccid2_start_rto_timer(struct sock *sk)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
+
+	BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
+	sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
+		       jiffies + hctx->ccid2hctx_rto);
+}
+
+static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	struct ccid2_seq *next;
 
-	hctx->pipe++;
+	hctx->ccid2hctx_pipe++;
 
-	hctx->seqh->ccid2s_seq   = dp->dccps_gss;
-	hctx->seqh->ccid2s_acked = 0;
-	hctx->seqh->ccid2s_sent  = jiffies;
+	hctx->ccid2hctx_seqh->ccid2s_seq   = dp->dccps_gss;
+	hctx->ccid2hctx_seqh->ccid2s_acked = 0;
+	hctx->ccid2hctx_seqh->ccid2s_sent  = jiffies;
 
-	next = hctx->seqh->ccid2s_next;
+	next = hctx->ccid2hctx_seqh->ccid2s_next;
 	/* check if we need to alloc more space */
-	if (next == hctx->seqt) {
+	if (next == hctx->ccid2hctx_seqt) {
 		if (ccid2_hc_tx_alloc_seq(hctx)) {
 			DCCP_CRIT("packet history - out of memory!");
 			/* FIXME: find a more graceful way to bail out */
 			return;
 		}
-		next = hctx->seqh->ccid2s_next;
-		BUG_ON(next == hctx->seqt);
+		next = hctx->ccid2hctx_seqh->ccid2s_next;
+		BUG_ON(next == hctx->ccid2hctx_seqt);
 	}
-	hctx->seqh = next;
+	hctx->ccid2hctx_seqh = next;
 
-	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe);
+	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
+		       hctx->ccid2hctx_pipe);
 
 	/*
 	 * FIXME: The code below is broken and the variables have been removed
@@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 	 */
 #if 0
 	/* Ack Ratio.  Need to maintain a concept of how many windows we sent */
-	hctx->arsent++;
+	hctx->ccid2hctx_arsent++;
 	/* We had an ack loss in this window... */
-	if (hctx->ackloss) {
-		if (hctx->arsent >= hctx->cwnd) {
-			hctx->arsent  = 0;
-			hctx->ackloss = 0;
+	if (hctx->ccid2hctx_ackloss) {
+		if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
+			hctx->ccid2hctx_arsent	= 0;
+			hctx->ccid2hctx_ackloss	= 0;
 		}
 	} else {
 		/* No acks lost up to now... */
@@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 			int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
 				    dp->dccps_l_ack_ratio;
 
-			denom = hctx->cwnd * hctx->cwnd / denom;
+			denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
 
-			if (hctx->arsent >= denom) {
+			if (hctx->ccid2hctx_arsent >= denom) {
 				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
-				hctx->arsent = 0;
+				hctx->ccid2hctx_arsent = 0;
 			}
 		} else {
 			/* we can't increase ack ratio further [1] */
-			hctx->arsent = 0; /* or maybe set it to cwnd*/
+			hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
 		}
 	}
 #endif
 
 	/* setup RTO timer */
-	if (!timer_pending(&hctx->rtotimer))
-		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
+	if (!timer_pending(&hctx->ccid2hctx_rtotimer))
+		ccid2_start_rto_timer(sk);
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 	do {
-		struct ccid2_seq *seqp = hctx->seqt;
+		struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
 
-		while (seqp != hctx->seqh) {
+		while (seqp != hctx->ccid2hctx_seqh) {
 			ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
 				       (unsigned long long)seqp->ccid2s_seq,
 				       seqp->ccid2s_acked, seqp->ccid2s_sent);
@@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 		}
 	} while (0);
 	ccid2_pr_debug("=========\n");
+	ccid2_hc_tx_check_sanity(hctx);
 #endif
 }
 
-/**
- * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
- * This code is almost identical with TCP's tcp_rtt_estimator(), since
- * - it has a higher sampling frequency (recommended by RFC 1323),
- * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
- * - it is simple (cf. more complex proposals such as Eifel timer or research
- *   which suggests that the gain should be set according to window size),
- * - in tests it was found to work well with CCID2 [gerrit].
+/* XXX Lame code duplication!
+ * returns -1 if none was found.
+ * else returns the next offset to use in the function call.
  */
-static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
+static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
+			   unsigned char **vec, unsigned char *veclen)
 {
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	long m = mrtt ? : 1;
-
-	if (hctx->srtt == 0) {
-		/* First measurement m */
-		hctx->srtt = m << 3;
-		hctx->mdev = m << 1;
-
-		hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
-		hctx->rttvar   = hctx->mdev_max;
-		hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
-	} else {
-		/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
-		m -= (hctx->srtt >> 3);
-		hctx->srtt += m;
-
-		/* Similarly, update scaled mdev with regard to |m| */
-		if (m < 0) {
-			m = -m;
-			m -= (hctx->mdev >> 2);
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+	unsigned char *opt_ptr;
+	const unsigned char *opt_end = (unsigned char *)dh +
+					(dh->dccph_doff * 4);
+	unsigned char opt, len;
+	unsigned char *value;
+
+	BUG_ON(offset < 0);
+	options += offset;
+	opt_ptr = options;
+	if (opt_ptr >= opt_end)
+		return -1;
+
+	while (opt_ptr != opt_end) {
+		opt   = *opt_ptr++;
+		len   = 0;
+		value = NULL;
+
+		/* Check if this isn't a single byte option */
+		if (opt > DCCPO_MAX_RESERVED) {
+			if (opt_ptr == opt_end)
+				goto out_invalid_option;
+
+			len = *opt_ptr++;
+			if (len < 3)
+				goto out_invalid_option;
 			/*
-			 * This neutralises RTO increase when RTT < SRTT - mdev
-			 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
-			 * in Linux TCP", USENIX 2002, pp. 49-62).
+			 * Remove the type and len fields, leaving
+			 * just the value size
 			 */
-			if (m > 0)
-				m >>= 3;
-		} else {
-			m -= (hctx->mdev >> 2);
-		}
-		hctx->mdev += m;
+			len     -= 2;
+			value   = opt_ptr;
+			opt_ptr += len;
 
-		if (hctx->mdev > hctx->mdev_max) {
-			hctx->mdev_max = hctx->mdev;
-			if (hctx->mdev_max > hctx->rttvar)
-				hctx->rttvar = hctx->mdev_max;
+			if (opt_ptr > opt_end)
+				goto out_invalid_option;
 		}
 
-		/*
-		 * Decay RTTVAR at most once per flight, exploiting that
-		 *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
-		 *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
-		 * GAR is a useful bound for FlightSize = pipe, AWL is probably
-		 * too low as it over-estimates pipe.
-		 */
-		if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
-			if (hctx->mdev_max < hctx->rttvar)
-				hctx->rttvar -= (hctx->rttvar -
-						 hctx->mdev_max) >> 2;
-			hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
-			hctx->mdev_max = TCP_RTO_MIN;
+		switch (opt) {
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			*vec	= value;
+			*veclen = len;
+			return offset + (opt_ptr - options);
 		}
 	}
 
-	/*
-	 * Set RTO from SRTT and RTTVAR
-	 * Clock granularity is ignored since the minimum error for RTTVAR is
-	 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
-	 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
-	 * does not retransmit data, DCCP does not require TCP's recommended
-	 * minimum timeout of one second".
-	 */
-	hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
+	return -1;
 
-	if (hctx->rto > DCCP_RTO_MAX)
-		hctx->rto = DCCP_RTO_MAX;
+out_invalid_option:
+	DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
+	return -1;
 }
 
-static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
-			  unsigned int *maxincr)
+static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (hctx->cwnd < hctx->ssthresh) {
-		if (*maxincr > 0 && ++hctx->packets_acked == 2) {
-			hctx->cwnd += 1;
-			*maxincr   -= 1;
-			hctx->packets_acked = 0;
-		}
-	} else if (++hctx->packets_acked >= hctx->cwnd) {
-			hctx->cwnd += 1;
-			hctx->packets_acked = 0;
-	}
-	/*
-	 * FIXME: RTT is sampled several times per acknowledgment (for each
-	 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
-	 * This causes the RTT to be over-estimated, since the older entries
-	 * in the Ack Vector have earlier sending times.
-	 * The cleanest solution is to not use the ccid2s_sent field at all
-	 * and instead use DCCP timestamps - need to be resolved at some time.
-	 */
-	ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
+	sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
+	ccid2_pr_debug("deleted RTO timer\n");
 }
 
-static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+static inline void ccid2_new_ack(struct sock *sk,
+				 struct ccid2_seq *seqp,
+				 unsigned int *maxincr)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (time_before(seqp->ccid2s_sent, hctx->last_cong)) {
-		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
-		return;
+	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
+		if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
+			hctx->ccid2hctx_cwnd += 1;
+			*maxincr	     -= 1;
+			hctx->ccid2hctx_packets_acked = 0;
+		}
+	} else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
+			hctx->ccid2hctx_cwnd += 1;
+			hctx->ccid2hctx_packets_acked = 0;
 	}
 
-	hctx->last_cong = jiffies;
+	/* update RTO */
+	if (hctx->ccid2hctx_srtt == -1 ||
+	    time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
+		unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
+		int s;
+
+		/* first measurement */
+		if (hctx->ccid2hctx_srtt == -1) {
+			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
+				       r, jiffies,
+				       (unsigned long long)seqp->ccid2s_seq);
+			ccid2_change_srtt(hctx, r);
+			hctx->ccid2hctx_rttvar = r >> 1;
+		} else {
+			/* RTTVAR */
+			long tmp = hctx->ccid2hctx_srtt - r;
+			long srtt;
+
+			if (tmp < 0)
+				tmp *= -1;
+
+			tmp >>= 2;
+			hctx->ccid2hctx_rttvar *= 3;
+			hctx->ccid2hctx_rttvar >>= 2;
+			hctx->ccid2hctx_rttvar += tmp;
+
+			/* SRTT */
+			srtt = hctx->ccid2hctx_srtt;
+			srtt *= 7;
+			srtt >>= 3;
+			tmp = r >> 3;
+			srtt += tmp;
+			ccid2_change_srtt(hctx, srtt);
+		}
+		s = hctx->ccid2hctx_rttvar << 2;
+		/* clock granularity is 1 when based on jiffies */
+		if (!s)
+			s = 1;
+		hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
+
+		/* must be at least a second */
+		s = hctx->ccid2hctx_rto / HZ;
+		/* DCCP doesn't require this [but I like it cuz my code sux] */
+#if 1
+		if (s < 1)
+			hctx->ccid2hctx_rto = HZ;
+#endif
+		/* max 60 seconds */
+		if (s > 60)
+			hctx->ccid2hctx_rto = HZ * 60;
 
-	hctx->cwnd     = hctx->cwnd / 2 ? : 1U;
-	hctx->ssthresh = max(hctx->cwnd, 2U);
+		hctx->ccid2hctx_lastrtt = jiffies;
 
-	/* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
-	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd)
-		ccid2_change_l_ack_ratio(sk, hctx->cwnd);
+		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
+			       hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
+			       hctx->ccid2hctx_rto, HZ, r);
+	}
+
+	/* we got a new ack, so re-start RTO timer */
+	ccid2_hc_tx_kill_rto_timer(sk);
+	ccid2_start_rto_timer(sk);
 }
 
-static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
-				     u8 option, u8 *optval, u8 optlen)
+static void ccid2_hc_tx_dec_pipe(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	switch (option) {
-	case DCCPO_ACK_VECTOR_0:
-	case DCCPO_ACK_VECTOR_1:
-		return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen,
-					      option - DCCPO_ACK_VECTOR_0);
+	if (hctx->ccid2hctx_pipe == 0)
+		DCCP_BUG("pipe == 0");
+	else
+		hctx->ccid2hctx_pipe--;
+
+	if (hctx->ccid2hctx_pipe == 0)
+		ccid2_hc_tx_kill_rto_timer(sk);
+}
+
+static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
+		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
+		return;
 	}
-	return 0;
+
+	hctx->ccid2hctx_last_cong = jiffies;
+
+	hctx->ccid2hctx_cwnd     = hctx->ccid2hctx_cwnd / 2 ? : 1U;
+	hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U);
+
+	/* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
+	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
+		ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
 }
 
 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
-	struct dccp_ackvec_parsed *avp;
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
+	unsigned char *vector;
+	unsigned char veclen;
+	int offset = 0;
 	int done = 0;
 	unsigned int maxincr = 0;
 
+	ccid2_hc_tx_check_sanity(hctx);
 	/* check reverse path congestion */
 	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
 
@@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * -sorbo.
 	 */
 	/* need to bootstrap */
-	if (hctx->rpdupack == -1) {
-		hctx->rpdupack = 0;
-		hctx->rpseq = seqno;
+	if (hctx->ccid2hctx_rpdupack == -1) {
+		hctx->ccid2hctx_rpdupack = 0;
+		hctx->ccid2hctx_rpseq = seqno;
 	} else {
 		/* check if packet is consecutive */
-		if (dccp_delta_seqno(hctx->rpseq, seqno) == 1)
-			hctx->rpseq = seqno;
+		if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1)
+			hctx->ccid2hctx_rpseq = seqno;
 		/* it's a later packet */
-		else if (after48(seqno, hctx->rpseq)) {
-			hctx->rpdupack++;
+		else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
+			hctx->ccid2hctx_rpdupack++;
 
 			/* check if we got enough dupacks */
-			if (hctx->rpdupack >= NUMDUPACK) {
-				hctx->rpdupack = -1; /* XXX lame */
-				hctx->rpseq = 0;
+			if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
+				hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
+				hctx->ccid2hctx_rpseq = 0;
 
 				ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
 			}
@@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* check forward path congestion */
-	if (dccp_packet_without_ack(skb))
+	/* still didn't send out new data packets */
+	if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
 		return;
 
-	/* still didn't send out new data packets */
-	if (hctx->seqh == hctx->seqt)
-		goto done;
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_ACK:
+	case DCCP_PKT_DATAACK:
+		break;
+	default:
+		return;
+	}
 
 	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
-	if (after48(ackno, hctx->high_ack))
-		hctx->high_ack = ackno;
+	if (after48(ackno, hctx->ccid2hctx_high_ack))
+		hctx->ccid2hctx_high_ack = ackno;
 
-	seqp = hctx->seqt;
+	seqp = hctx->ccid2hctx_seqt;
 	while (before48(seqp->ccid2s_seq, ackno)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->seqh) {
-			seqp = hctx->seqh->ccid2s_prev;
+		if (seqp == hctx->ccid2hctx_seqh) {
+			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * packets per acknowledgement. Rounding up avoids that cwnd is not
 	 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
 	 */
-	if (hctx->cwnd < hctx->ssthresh)
+	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
 		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
 
 	/* go through all ack vectors */
-	list_for_each_entry(avp, &hctx->av_chunks, node) {
+	while ((offset = ccid2_ackvector(sk, skb, offset,
+					 &vector, &veclen)) != -1) {
 		/* go through this ack vector */
-		for (; avp->len--; avp->vec++) {
-			u64 ackno_end_rl = SUB48(ackno,
-						 dccp_ackvec_runlen(avp->vec));
+		while (veclen--) {
+			const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+			u64 ackno_end_rl = SUB48(ackno, rl);
 
-			ccid2_pr_debug("ackvec %llu |%u,%u|\n",
+			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
 				       (unsigned long long)ackno,
-				       dccp_ackvec_state(avp->vec) >> 6,
-				       dccp_ackvec_runlen(avp->vec));
+				       (unsigned long long)ackno_end_rl);
 			/* if the seqno we are analyzing is larger than the
 			 * current ackno, then move towards the tail of our
 			 * seqnos.
 			 */
 			while (after48(seqp->ccid2s_seq, ackno)) {
-				if (seqp == hctx->seqt) {
+				if (seqp == hctx->ccid2hctx_seqt) {
 					done = 1;
 					break;
 				}
@@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = dccp_ackvec_state(avp->vec);
+				const u8 state = *vector &
+						 DCCP_ACKVEC_STATE_MASK;
 
 				/* new packet received or marked */
-				if (state != DCCPAV_NOT_RECEIVED &&
+				if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
 				    !seqp->ccid2s_acked) {
-					if (state == DCCPAV_ECN_MARKED)
+					if (state ==
+					    DCCP_ACKVEC_STATE_ECN_MARKED) {
 						ccid2_congestion_event(sk,
 								       seqp);
-					else
+					} else
 						ccid2_new_ack(sk, seqp,
 							      &maxincr);
 
 					seqp->ccid2s_acked = 1;
 					ccid2_pr_debug("Got ack for %llu\n",
 						       (unsigned long long)seqp->ccid2s_seq);
-					hctx->pipe--;
+					ccid2_hc_tx_dec_pipe(sk);
 				}
-				if (seqp == hctx->seqt) {
+				if (seqp == hctx->ccid2hctx_seqt) {
 					done = 1;
 					break;
 				}
@@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				break;
 
 			ackno = SUB48(ackno_end_rl, 1);
+			vector++;
 		}
 		if (done)
 			break;
@@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	/* The state about what is acked should be correct now
 	 * Check for NUMDUPACK
 	 */
-	seqp = hctx->seqt;
-	while (before48(seqp->ccid2s_seq, hctx->high_ack)) {
+	seqp = hctx->ccid2hctx_seqt;
+	while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->seqh) {
-			seqp = hctx->seqh->ccid2s_prev;
+		if (seqp == hctx->ccid2hctx_seqh) {
+			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			if (done == NUMDUPACK)
 				break;
 		}
-		if (seqp == hctx->seqt)
+		if (seqp == hctx->ccid2hctx_seqt)
 			break;
 		seqp = seqp->ccid2s_prev;
 	}
@@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				 * one ack vector.
 				 */
 				ccid2_congestion_event(sk, seqp);
-				hctx->pipe--;
+				ccid2_hc_tx_dec_pipe(sk);
 			}
-			if (seqp == hctx->seqt)
+			if (seqp == hctx->ccid2hctx_seqt)
 				break;
 			seqp = seqp->ccid2s_prev;
 		}
 
-		hctx->seqt = last_acked;
+		hctx->ccid2hctx_seqt = last_acked;
 	}
 
 	/* trim acked packets in tail */
-	while (hctx->seqt != hctx->seqh) {
-		if (!hctx->seqt->ccid2s_acked)
+	while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
+		if (!hctx->ccid2hctx_seqt->ccid2s_acked)
 			break;
 
-		hctx->seqt = hctx->seqt->ccid2s_next;
+		hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
 	}
 
-	/* restart RTO timer if not all outstanding data has been acked */
-	if (hctx->pipe == 0)
-		sk_stop_timer(sk, &hctx->rtotimer);
-	else
-		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
-done:
-	/* check if incoming Acks allow pending packets to be sent */
-	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
-		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
-	dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
+	ccid2_hc_tx_check_sanity(hctx);
 }
 
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	u32 max_ratio;
 
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
-	hctx->ssthresh = ~0U;
+	hctx->ccid2hctx_ssthresh  = ~0U;
 
-	/* Use larger initial windows (RFC 3390, rfc2581bis) */
-	hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+	/*
+	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
+	 * packets for new connections, following the rules from [RFC3390]".
+	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
+	 */
+	hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
-	max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
+	max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
 	if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
 		dp->dccps_l_ack_ratio = max_ratio;
 
@@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	if (ccid2_hc_tx_alloc_seq(hctx))
 		return -ENOMEM;
 
-	hctx->rto	= DCCP_TIMEOUT_INIT;
-	hctx->rpdupack  = -1;
-	hctx->last_cong = jiffies;
-	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
-	INIT_LIST_HEAD(&hctx->av_chunks);
+	hctx->ccid2hctx_rto	 = 3 * HZ;
+	ccid2_change_srtt(hctx, -1);
+	hctx->ccid2hctx_rttvar	 = -1;
+	hctx->ccid2hctx_rpdupack = -1;
+	hctx->ccid2hctx_last_cong = jiffies;
+	setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
+			(unsigned long)sk);
+
+	ccid2_hc_tx_check_sanity(hctx);
 	return 0;
 }
 
@@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk)
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	int i;
 
-	sk_stop_timer(sk, &hctx->rtotimer);
+	ccid2_hc_tx_kill_rto_timer(sk);
 
-	for (i = 0; i < hctx->seqbufc; i++)
-		kfree(hctx->seqbuf[i]);
-	hctx->seqbufc = 0;
+	for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
+		kfree(hctx->ccid2hctx_seqbuf[i]);
+	hctx->ccid2hctx_seqbufc = 0;
 }
 
 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	switch (DCCP_SKB_CB(skb)->dccpd_type) {
 	case DCCP_PKT_DATA:
 	case DCCP_PKT_DATAACK:
-		hcrx->data++;
-		if (hcrx->data >= dp->dccps_r_ack_ratio) {
+		hcrx->ccid2hcrx_data++;
+		if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
 			dccp_send_ack(sk);
-			hcrx->data = 0;
+			hcrx->ccid2hcrx_data = 0;
 		}
 		break;
 	}
 }
 
 static struct ccid_operations ccid2 = {
-	.ccid_id		  = DCCPC_CCID2,
-	.ccid_name		  = "TCP-like",
-	.ccid_owner		  = THIS_MODULE,
-	.ccid_hc_tx_obj_size	  = sizeof(struct ccid2_hc_tx_sock),
-	.ccid_hc_tx_init	  = ccid2_hc_tx_init,
-	.ccid_hc_tx_exit	  = ccid2_hc_tx_exit,
-	.ccid_hc_tx_send_packet	  = ccid2_hc_tx_send_packet,
-	.ccid_hc_tx_packet_sent	  = ccid2_hc_tx_packet_sent,
-	.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
-	.ccid_hc_tx_packet_recv	  = ccid2_hc_tx_packet_recv,
-	.ccid_hc_rx_obj_size	  = sizeof(struct ccid2_hc_rx_sock),
-	.ccid_hc_rx_packet_recv	  = ccid2_hc_rx_packet_recv,
+	.ccid_id		= DCCPC_CCID2,
+	.ccid_name		= "TCP-like",
+	.ccid_owner		= THIS_MODULE,
+	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
+	.ccid_hc_tx_init	= ccid2_hc_tx_init,
+	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
+	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
+	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
+	.ccid_hc_rx_obj_size	= sizeof(struct ccid2_hc_rx_sock),
+	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 8b7a2dee2f6..2c94ca02901 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,49 +42,34 @@ struct ccid2_seq {
 
 /** struct ccid2_hc_tx_sock - CCID2 TX half connection
  *
- * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @srtt: smoothed RTT estimate, scaled by 2^3
- * @mdev: smoothed RTT variation, scaled by 2^2
- * @mdev_max: maximum of @mdev during one flight
- * @rttvar: moving average/maximum of @mdev_max
- * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
- * @rtt_seq: to decay RTTVAR at most once per flight
- * @rpseq: last consecutive seqno
- * @rpdupack: dupacks since rpseq
- * @av_chunks: list of Ack Vectors received on current skb
- */
+ * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
+ * @ccid2hctx_lastrtt -time RTT was last measured
+ * @ccid2hctx_rpseq - last consecutive seqno
+ * @ccid2hctx_rpdupack - dupacks since rpseq
+*/
 struct ccid2_hc_tx_sock {
-	u32			cwnd;
-	u32			ssthresh;
-	u32			pipe;
-	u32			packets_acked;
-	struct ccid2_seq	*seqbuf[CCID2_SEQBUF_MAX];
-	int			seqbufc;
-	struct ccid2_seq	*seqh;
-	struct ccid2_seq	*seqt;
-	/* RTT measurement: variables/principles are the same as in TCP */
-	u32			srtt,
-				mdev,
-				mdev_max,
-				rttvar,
-				rto;
-	u64			rtt_seq:48;
-	struct timer_list	rtotimer;
-	u64			rpseq;
-	int			rpdupack;
-	unsigned long		last_cong;
-	u64			high_ack;
-	struct list_head	av_chunks;
+	u32			ccid2hctx_cwnd;
+	u32			ccid2hctx_ssthresh;
+	u32			ccid2hctx_pipe;
+	u32			ccid2hctx_packets_acked;
+	struct ccid2_seq	*ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
+	int			ccid2hctx_seqbufc;
+	struct ccid2_seq	*ccid2hctx_seqh;
+	struct ccid2_seq	*ccid2hctx_seqt;
+	long			ccid2hctx_rto;
+	long			ccid2hctx_srtt;
+	long			ccid2hctx_rttvar;
+	unsigned long		ccid2hctx_lastrtt;
+	struct timer_list	ccid2hctx_rtotimer;
+	u64			ccid2hctx_rpseq;
+	int			ccid2hctx_rpdupack;
+	unsigned long		ccid2hctx_last_cong;
+	u64			ccid2hctx_high_ack;
 };
 
-static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
-{
-	return (hctx->pipe >= hctx->cwnd);
-}
-
 struct ccid2_hc_rx_sock {
-	int			data;
+	int	ccid2hcrx_data;
 };
 
 static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 06cfdad84a6..3b8bd7ca676 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,41 +49,75 @@ static int ccid3_debug;
 /*
  *	Transmitter Half-Connection Routines
  */
-/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */
-static int do_osc_prev = true;
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+	static char *ccid3_state_names[] = {
+	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
+	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+	[TFRC_SSTATE_FBACK]    = "FBACK",
+	[TFRC_SSTATE_TERM]     = "TERM",
+	};
+
+	return ccid3_state_names[state];
+}
+#endif
+
+static void ccid3_hc_tx_set_state(struct sock *sk,
+				  enum ccid3_hc_tx_states state)
+{
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
+		       ccid3_tx_state_name(state));
+	WARN_ON(state == oldstate);
+	hctx->ccid3hctx_state = state;
+}
 
 /*
  * Compute the initial sending rate X_init in the manner of RFC 3390:
  *
- *	X_init  =  min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT
+ *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
  *
+ * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
+ * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
  * For consistency with other parts of the code, X_init is scaled by 2^6.
  */
 static inline u64 rfc3390_initial_rate(struct sock *sk)
 {
-	const u32 mps = dccp_sk(sk)->dccps_mss_cache,
-	       w_init = clamp(4380U, 2 * mps, 4 * mps);
+	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	const __u32 w_init = clamp_t(__u32, 4380U,
+			2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
 
-	return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt);
+	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
 }
 
-/**
- * ccid3_update_send_interval  -  Calculate new t_ipi = s / X
- * This respects the granularity of X (64 * bytes/second) and enforces the
- * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
+/*
+ * Recalculate t_ipi and delta (should be called whenever X changes)
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
 {
-	if (unlikely(hctx->x <= hctx->s))
-		hctx->x	= hctx->s;
-	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
+	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
+	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
+					     hctx->ccid3hctx_x);
+
+	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
+	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
+					   TFRC_OPSYS_HALF_TIME_GRAN);
+
+	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
+		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
+		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
+
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->t_last_win_count);
+	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
 
-	return delta / hctx->rtt;
+	return delta / hctx->ccid3hctx_rtt;
 }
 
 /**
@@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	u64 min_rate = 2 * hctx->x_recv;
-	const u64 old_x = hctx->x;
+	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
+	const  __u64 old_x = hctx->ccid3hctx_x;
 	ktime_t now = stamp ? *stamp : ktime_get_real();
 
 	/*
@@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	 */
 	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
 		min_rate = rfc3390_initial_rate(sk);
-		min_rate = max(min_rate, 2 * hctx->x_recv);
+		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
 	}
 
-	if (hctx->p > 0) {
+	if (hctx->ccid3hctx_p > 0) {
 
-		hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);
+		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
+					min_rate);
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
+					(((__u64)hctx->ccid3hctx_s) << 6) /
+								TFRC_T_MBI);
 
-	} else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {
+	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
+				- (s64)hctx->ccid3hctx_rtt >= 0) {
 
-		hctx->x = min(2 * hctx->x, min_rate);
-		hctx->x = max(hctx->x,
-			      scaled_div(((u64)hctx->s) << 6, hctx->rtt));
-		hctx->t_ld = now;
+		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
+			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
+				       hctx->ccid3hctx_rtt));
+		hctx->ccid3hctx_t_ld = now;
 	}
 
-	if (hctx->x != old_x) {
+	if (hctx->ccid3hctx_x != old_x) {
 		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
 			       "X_recv=%u\n", (unsigned)(old_x >> 6),
-			       (unsigned)(hctx->x >> 6), hctx->x_calc,
-			       (unsigned)(hctx->x_recv >> 6));
+			       (unsigned)(hctx->ccid3hctx_x >> 6),
+			       hctx->ccid3hctx_x_calc,
+			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
 
 		ccid3_update_send_interval(hctx);
 	}
 }
 
 /*
- * ccid3_hc_tx_measure_packet_size  -  Measuring the packet size `s' (sec 4.1)
- * @new_len: DCCP payload size in bytes (not used by all methods)
+ *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
+ *	@len: DCCP packet payload size in bytes
  */
-static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
+static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
 {
-#if   defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
-	return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9);
-#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX)
-	return max(ccid3_hc_tx_sk(sk)->s, new_len);
-#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS	*/
-	return dccp_sk(sk)->dccps_mss_cache;
-#endif
+	const u16 old_s = hctx->ccid3hctx_s;
+
+	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
+
+	if (hctx->ccid3hctx_s != old_s)
+		ccid3_update_send_interval(hctx);
 }
 
 /*
@@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
 						ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->t_last_win_count),
-	    quarter_rtts = (4 * delta) / hctx->rtt;
+	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
+	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
 
 	if (quarter_rtts > 0) {
-		hctx->t_last_win_count = now;
-		hctx->last_win_count  += min(quarter_rtts, 5U);
-		hctx->last_win_count  &= 0xF;		/* mod 16 */
+		hctx->ccid3hctx_t_last_win_count = now;
+		hctx->ccid3hctx_last_win_count  += min(quarter_rtts, 5U);
+		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
 	}
 }
 
@@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		goto restart_timer;
 	}
 
-	ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk,
-		       hctx->feedback ? "" : "out");
+	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
+		       ccid3_tx_state_name(hctx->ccid3hctx_state));
 
-	/* Ignore and do not restart after leaving the established state */
-	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
 		goto out;
 
-	/* Reset feedback state to "no feedback received" */
-	hctx->feedback = false;
-
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
-	 * RTO is 0 if and only if no feedback has been received yet.
 	 */
-	if (hctx->t_rto == 0 || hctx->p == 0) {
+	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
+	    hctx->ccid3hctx_p == 0) {
 
 		/* halve send rate directly */
-		hctx->x /= 2;
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
+					(((__u64)hctx->ccid3hctx_s) << 6) /
+								    TFRC_T_MBI);
 		ccid3_update_send_interval(hctx);
-
 	} else {
 		/*
 		 *  Modify the cached value of X_recv
@@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		 *
 		 *  Note that X_recv is scaled by 2^6 while X_calc is not
 		 */
-		BUG_ON(hctx->p && !hctx->x_calc);
+		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
 
-		if (hctx->x_calc > (hctx->x_recv >> 5))
-			hctx->x_recv /= 2;
+		if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
+			hctx->ccid3hctx_x_recv =
+				max(hctx->ccid3hctx_x_recv / 2,
+				    (((__u64)hctx->ccid3hctx_s) << 6) /
+							      (2 * TFRC_T_MBI));
 		else {
-			hctx->x_recv = hctx->x_calc;
-			hctx->x_recv <<= 4;
+			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
+			hctx->ccid3hctx_x_recv <<= 4;
 		}
 		ccid3_hc_tx_update_x(sk, NULL);
 	}
 	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
-			(unsigned long long)hctx->x);
+			(unsigned long long)hctx->ccid3hctx_x);
 
 	/*
 	 * Set new timeout for the nofeedback timer.
 	 * See comments in packet_recv() regarding the value of t_RTO.
 	 */
-	if (unlikely(hctx->t_rto == 0))		/* no feedback received yet */
+	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
 		t_nfb = TFRC_INITIAL_TIMEOUT;
 	else
-		t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
+		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
 
 restart_timer:
-	sk_reset_timer(sk, &hctx->no_feedback_timer,
+	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
 			   jiffies + usecs_to_jiffies(t_nfb));
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-/**
- * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
- * @skb: next packet candidate to send on @sk
- * This function uses the convention of ccid_packet_dequeue_eval() and
- * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
+/*
+ * returns
+ *   > 0: delay (in msecs) that should pass before actually sending
+ *   = 0: can send immediately
+ *   < 0: error condition; do not send packet
  */
 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
@@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	if (hctx->s == 0) {
-		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       (jiffies +
 				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
-		hctx->last_win_count   = 0;
-		hctx->t_last_win_count = now;
+		hctx->ccid3hctx_last_win_count	 = 0;
+		hctx->ccid3hctx_t_last_win_count = now;
 
 		/* Set t_0 for initial packet */
-		hctx->t_nom = now;
+		hctx->ccid3hctx_t_nom = now;
+
+		hctx->ccid3hctx_s = skb->len;
 
 		/*
 		 * Use initial RTT sample when available: recommended by erratum
@@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (dp->dccps_syn_rtt) {
 			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
-			hctx->rtt  = dp->dccps_syn_rtt;
-			hctx->x    = rfc3390_initial_rate(sk);
-			hctx->t_ld = now;
+			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
+			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
+			hctx->ccid3hctx_t_ld = now;
 		} else {
 			/*
 			 * Sender does not have RTT sample:
@@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			 *   is needed in several parts (e.g.  window counter);
 			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
 			 */
-			hctx->rtt = DCCP_FALLBACK_RTT;
-			hctx->x	  = dp->dccps_mss_cache;
-			hctx->x <<= 6;
+			hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
+			hctx->ccid3hctx_x   = hctx->ccid3hctx_s;
+			hctx->ccid3hctx_x <<= 6;
 		}
-
-		/* Compute t_ipi = s / X */
-		hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
 		ccid3_update_send_interval(hctx);
 
-		/* Seed value for Oscillation Prevention (sec. 4.5) */
-		hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt);
-
-	} else {
-		delay = ktime_us_delta(hctx->t_nom, now);
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+		break;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
 		/*
 		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
@@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 * else
 		 *       // send the packet in (t_nom - t_now) milliseconds.
 		 */
-		if (delay >= TFRC_T_DELTA)
-			return (u32)delay / USEC_PER_MSEC;
+		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
+			return (u32)delay / 1000L;
 
 		ccid3_hc_tx_update_win_count(hctx, now);
+		break;
+	case TFRC_SSTATE_TERM:
+		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
+		return -EINVAL;
 	}
 
 	/* prepare to send now (add options etc.) */
 	dp->dccps_hc_tx_insert_options = 1;
-	DCCP_SKB_CB(skb)->dccpd_ccval  = hctx->last_win_count;
+	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
 
 	/* set the nominal send time for the next following packet */
-	hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi);
-	return CCID_PACKET_SEND_AT_ONCE;
+	hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
+					     hctx->ccid3hctx_t_ipi);
+	return 0;
 }
 
-static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
+				    unsigned int len)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
-	/* Changes to s will become effective the next time X is computed */
-	hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);
+	ccid3_hc_tx_update_s(hctx, len);
 
-	if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss))
+	if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
 		DCCP_CRIT("packet history - out of memory!");
 }
 
 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	struct tfrc_tx_hist_entry *acked;
+	struct ccid3_options_received *opt_recv;
 	ktime_t now;
 	unsigned long t_nfb;
-	u32 r_sample;
+	u32 pinv, r_sample;
 
 	/* we are only interested in ACKs */
 	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
 	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
 		return;
-	/*
-	 * Locate the acknowledged packet in the TX history.
-	 *
-	 * Returning "entry not found" here can for instance happen when
-	 *  - the host has not sent out anything (e.g. a passive server),
-	 *  - the Ack is outdated (packet with higher Ack number was received),
-	 *  - it is a bogus Ack (for a packet not sent on this connection).
-	 */
-	acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb));
-	if (acked == NULL)
+	/* ... and only in the established state */
+	if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
+	    hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+		return;
+
+	opt_recv = &hctx->ccid3hctx_options_received;
+	now = ktime_get_real();
+
+	/* Estimate RTT from history if ACK number is valid */
+	r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
+				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
+	if (r_sample == 0) {
+		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
+			  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
+			  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
 		return;
-	/* For the sake of RTT sampling, ignore/remove all older entries */
-	tfrc_tx_hist_purge(&acked->next);
+	}
 
-	/* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
-	now	  = ktime_get_real();
-	r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
-	hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
+	/* Update receive rate in units of 64 * bytes/second */
+	hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
+	hctx->ccid3hctx_x_recv <<= 6;
 
+	/* Update loss event rate (which is scaled by 1e6) */
+	pinv = opt_recv->ccid3or_loss_event_rate;
+	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
+		hctx->ccid3hctx_p = 0;
+	else				       /* can not exceed 100% */
+		hctx->ccid3hctx_p = scaled_div(1, pinv);
+	/*
+	 * Validate new RTT sample and update moving average
+	 */
+	r_sample = dccp_sample_rtt(sk, r_sample);
+	hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
 	 */
-	if (!hctx->feedback) {
-		hctx->feedback = true;
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
 
-		if (hctx->t_rto == 0) {
+		if (hctx->ccid3hctx_t_rto == 0) {
 			/*
 			 * Initial feedback packet: Larger Initial Windows (4.2)
 			 */
-			hctx->x    = rfc3390_initial_rate(sk);
-			hctx->t_ld = now;
+			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
+			hctx->ccid3hctx_t_ld = now;
 
 			ccid3_update_send_interval(hctx);
 
 			goto done_computing_x;
-		} else if (hctx->p == 0) {
+		} else if (hctx->ccid3hctx_p == 0) {
 			/*
 			 * First feedback after nofeedback timer expiry (4.3)
 			 */
@@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
-	if (hctx->p > 0)
-		hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p);
+	if (hctx->ccid3hctx_p > 0)
+		hctx->ccid3hctx_x_calc =
+				tfrc_calc_x(hctx->ccid3hctx_s,
+					    hctx->ccid3hctx_rtt,
+					    hctx->ccid3hctx_p);
 	ccid3_hc_tx_update_x(sk, &now);
 
 done_computing_x:
 	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
 			       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
-			       dccp_role(sk), sk, hctx->rtt, r_sample,
-			       hctx->s, hctx->p, hctx->x_calc,
-			       (unsigned)(hctx->x_recv >> 6),
-			       (unsigned)(hctx->x >> 6));
-	/*
-	 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to
-	 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
-	 * can be useful if few connections share a link, avoiding that buffer
-	 * fill levels (RTT) oscillate as a result of frequent adjustments to X.
-	 * A useful presentation with background information is in
-	 *    Joerg Widmer, "Equation-Based Congestion Control",
-	 *    MSc Thesis, University of Mannheim, Germany, 2000
-	 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
-	 */
-	if (do_osc_prev) {
-		r_sample = tfrc_scaled_sqrt(r_sample);
-		/*
-		 * The modulation can work in both ways: increase/decrease t_ipi
-		 * according to long-term increases/decreases of the RTT. The
-		 * former is a useful measure, since it works against queue
-		 * build-up. The latter temporarily increases the sending rate,
-		 * so that buffers fill up more quickly. This in turn causes
-		 * the RTT to increase, so that either later reduction becomes
-		 * necessary or the RTT stays at a very high level. Decreasing
-		 * t_ipi is therefore not supported.
-		 * Furthermore, during the initial slow-start phase the RTT
-		 * naturally increases, where using the algorithm would cause
-		 * delays. Hence it is disabled during the initial slow-start.
-		 */
-		if (r_sample > hctx->r_sqmean && hctx->p > 0)
-			hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
-					      hctx->r_sqmean);
-		hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
-		/* update R_sqmean _after_ computing the modulation factor */
-		hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
-	}
+			       dccp_role(sk),
+			       sk, hctx->ccid3hctx_rtt, r_sample,
+			       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
+			       hctx->ccid3hctx_x_calc,
+			       (unsigned)(hctx->ccid3hctx_x_recv >> 6),
+			       (unsigned)(hctx->ccid3hctx_x >> 6));
 
 	/* unschedule no feedback timer */
-	sk_stop_timer(sk, &hctx->no_feedback_timer);
+	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
 
 	/*
 	 * As we have calculated new ipi, delta, t_nom it is possible
@@ -453,66 +488,95 @@ done_computing_x:
 	 * This can help avoid triggering the nofeedback timer too
 	 * often ('spinning') on LANs with small RTTs.
 	 */
-	hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO *
-						 (USEC_PER_SEC / 1000)));
+	hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
+					   (CONFIG_IP_DCCP_CCID3_RTO *
+					    (USEC_PER_SEC / 1000)));
 	/*
 	 * Schedule no feedback timer to expire in
 	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
 	 */
-	t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
+	t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
 
 	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
 		       "expire in %lu jiffies (%luus)\n",
-		       dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
+		       dccp_role(sk),
+		       sk, usecs_to_jiffies(t_nfb), t_nfb);
 
-	sk_reset_timer(sk, &hctx->no_feedback_timer,
+	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
 			   jiffies + usecs_to_jiffies(t_nfb));
 }
 
-static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
-				     u8 option, u8 *optval, u8 optlen)
+static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
+				     unsigned char len, u16 idx,
+				     unsigned char *value)
 {
+	int rc = 0;
+	const struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	struct ccid3_options_received *opt_recv;
 	__be32 opt_val;
 
-	switch (option) {
-	case TFRC_OPT_RECEIVE_RATE:
-	case TFRC_OPT_LOSS_EVENT_RATE:
-		/* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
-		if (packet_type == DCCP_PKT_DATA)
-			break;
-		if (unlikely(optlen != 4)) {
-			DCCP_WARN("%s(%p), invalid len %d for %u\n",
-				  dccp_role(sk), sk, optlen, option);
-			return -EINVAL;
-		}
-		opt_val = ntohl(get_unaligned((__be32 *)optval));
+	opt_recv = &hctx->ccid3hctx_options_received;
 
-		if (option == TFRC_OPT_RECEIVE_RATE) {
-			/* Receive Rate is kept in units of 64 bytes/second */
-			hctx->x_recv = opt_val;
-			hctx->x_recv <<= 6;
+	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
+		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
+		opt_recv->ccid3or_loss_event_rate    = ~0;
+		opt_recv->ccid3or_loss_intervals_idx = 0;
+		opt_recv->ccid3or_loss_intervals_len = 0;
+		opt_recv->ccid3or_receive_rate	     = 0;
+	}
 
-			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
-				       dccp_role(sk), sk, opt_val);
+	switch (option) {
+	case TFRC_OPT_LOSS_EVENT_RATE:
+		if (unlikely(len != 4)) {
+			DCCP_WARN("%s(%p), invalid len %d "
+				  "for TFRC_OPT_LOSS_EVENT_RATE\n",
+				  dccp_role(sk), sk, len);
+			rc = -EINVAL;
 		} else {
-			/* Update the fixpoint Loss Event Rate fraction */
-			hctx->p = tfrc_invert_loss_event_rate(opt_val);
-
+			opt_val = get_unaligned((__be32 *)value);
+			opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
 			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
-				       dccp_role(sk), sk, opt_val);
+				       dccp_role(sk), sk,
+				       opt_recv->ccid3or_loss_event_rate);
 		}
+		break;
+	case TFRC_OPT_LOSS_INTERVALS:
+		opt_recv->ccid3or_loss_intervals_idx = idx;
+		opt_recv->ccid3or_loss_intervals_len = len;
+		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
+			       dccp_role(sk), sk,
+			       opt_recv->ccid3or_loss_intervals_idx,
+			       opt_recv->ccid3or_loss_intervals_len);
+		break;
+	case TFRC_OPT_RECEIVE_RATE:
+		if (unlikely(len != 4)) {
+			DCCP_WARN("%s(%p), invalid len %d "
+				  "for TFRC_OPT_RECEIVE_RATE\n",
+				  dccp_role(sk), sk, len);
+			rc = -EINVAL;
+		} else {
+			opt_val = get_unaligned((__be32 *)value);
+			opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
+				       dccp_role(sk), sk,
+				       opt_recv->ccid3or_receive_rate);
+		}
+		break;
 	}
-	return 0;
+
+	return rc;
 }
 
 static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
 
-	hctx->hist  = NULL;
-	setup_timer(&hctx->no_feedback_timer,
-		    ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
+	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
+	hctx->ccid3hctx_hist = NULL;
+	setup_timer(&hctx->ccid3hctx_no_feedback_timer,
+			ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
+
 	return 0;
 }
 
@@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
-	sk_stop_timer(sk, &hctx->no_feedback_timer);
-	tfrc_tx_hist_purge(&hctx->hist);
+	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
+	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+	tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
 }
 
 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
 {
-	info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto;
-	info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt;
+	struct ccid3_hc_tx_sock *hctx;
+
+	/* Listen socks doesn't have a private CCID block */
+	if (sk->sk_state == DCCP_LISTEN)
+		return;
+
+	hctx = ccid3_hc_tx_sk(sk);
+	info->tcpi_rto = hctx->ccid3hctx_t_rto;
+	info->tcpi_rtt = hctx->ccid3hctx_rtt;
 }
 
 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
-	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	struct tfrc_tx_info tfrc;
+	const struct ccid3_hc_tx_sock *hctx;
 	const void *val;
 
+	/* Listen socks doesn't have a private CCID block */
+	if (sk->sk_state == DCCP_LISTEN)
+		return -EINVAL;
+
+	hctx = ccid3_hc_tx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_TX_INFO:
-		if (len < sizeof(tfrc))
+		if (len < sizeof(hctx->ccid3hctx_tfrc))
 			return -EINVAL;
-		tfrc.tfrctx_x	   = hctx->x;
-		tfrc.tfrctx_x_recv = hctx->x_recv;
-		tfrc.tfrctx_x_calc = hctx->x_calc;
-		tfrc.tfrctx_rtt	   = hctx->rtt;
-		tfrc.tfrctx_p	   = hctx->p;
-		tfrc.tfrctx_rto	   = hctx->t_rto;
-		tfrc.tfrctx_ipi	   = hctx->t_ipi;
-		len = sizeof(tfrc);
-		val = &tfrc;
+		len = sizeof(hctx->ccid3hctx_tfrc);
+		val = &hctx->ccid3hctx_tfrc;
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 /*
  *	Receiver Half-Connection Routines
  */
+
+/* CCID3 feedback types */
+enum ccid3_fback_type {
+	CCID3_FBACK_NONE = 0,
+	CCID3_FBACK_INITIAL,
+	CCID3_FBACK_PERIODIC,
+	CCID3_FBACK_PARAM_CHANGE
+};
+
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
+{
+	static char *ccid3_rx_state_names[] = {
+	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
+	[TFRC_RSTATE_DATA]    = "DATA",
+	[TFRC_RSTATE_TERM]    = "TERM",
+	};
+
+	return ccid3_rx_state_names[state];
+}
+#endif
+
+static void ccid3_hc_rx_set_state(struct sock *sk,
+				  enum ccid3_hc_rx_states state)
+{
+	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
+		       ccid3_rx_state_name(state));
+	WARN_ON(state == oldstate);
+	hcrx->ccid3hcrx_state = state;
+}
+
 static void ccid3_hc_rx_send_feedback(struct sock *sk,
 				      const struct sk_buff *skb,
 				      enum ccid3_fback_type fbtype)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+	ktime_t now;
+	s64 delta = 0;
+
+	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
+		return;
+
+	now = ktime_get_real();
 
 	switch (fbtype) {
 	case CCID3_FBACK_INITIAL:
-		hcrx->x_recv = 0;
-		hcrx->p_inverse = ~0U;   /* see RFC 4342, 8.5 */
+		hcrx->ccid3hcrx_x_recv = 0;
+		hcrx->ccid3hcrx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
 		break;
 	case CCID3_FBACK_PARAM_CHANGE:
-		if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
-			/*
-			 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
-			 * FIXME: in rfc3448bis the receiver returns X_recv=0
-			 * here as it normally would in the first feedback packet.
-			 * However this is not possible yet, since the code still
-			 * uses RFC 3448, i.e.
-			 *    If (p > 0)
-			 *      Calculate X_calc using the TCP throughput equation.
-			 *      X = max(min(X_calc, 2*X_recv), s/t_mbi);
-			 * would bring X down to s/t_mbi. That is why we return
-			 * X_recv according to rfc3448bis-06 for the moment.
-			 */
-			u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
-			    rtt = tfrc_rx_hist_rtt(&hcrx->hist);
-
-			hcrx->x_recv = scaled_div32(s, 2 * rtt);
-			break;
-		}
 		/*
 		 * When parameters change (new loss or p > p_prev), we do not
 		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
-		 * always check whether at least RTT time units were covered.
+		 * need to  reuse the previous value of X_recv. However, when
+		 * X_recv was 0 (due to early loss), this would kill X down to
+		 * s/t_mbi (i.e. one packet in 64 seconds).
+		 * To avoid such drastic reduction, we approximate X_recv as
+		 * the number of bytes since last feedback.
+		 * This is a safe fallback, since X is bounded above by X_calc.
 		 */
-		hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
-		break;
+		if (hcrx->ccid3hcrx_x_recv > 0)
+			break;
+		/* fall through */
 	case CCID3_FBACK_PERIODIC:
-		/*
-		 * Step (2) of rfc3448bis-06, 6.2:
-		 * - if no data packets have been received, just restart timer
-		 * - if data packets have been received, re-compute X_recv
-		 */
-		if (hcrx->hist.bytes_recvd == 0)
-			goto prepare_for_next_time;
-		hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
+		delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
+		if (delta <= 0)
+			DCCP_BUG("delta (%ld) <= 0", (long)delta);
+		else
+			hcrx->ccid3hcrx_x_recv =
+				scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
 		break;
 	default:
 		return;
 	}
 
-	ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse);
+	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
+		       hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
 
-	dccp_sk(sk)->dccps_hc_rx_insert_options = 1;
-	dccp_send_ack(sk);
+	hcrx->ccid3hcrx_tstamp_last_feedback = now;
+	hcrx->ccid3hcrx_last_counter	     = dccp_hdr(skb)->dccph_ccval;
+	hcrx->ccid3hcrx_bytes_recv	     = 0;
 
-prepare_for_next_time:
-	tfrc_rx_hist_restart_byte_counter(&hcrx->hist);
-	hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
-	hcrx->feedback	   = fbtype;
+	dp->dccps_hc_rx_insert_options = 1;
+	dccp_send_ack(sk);
 }
 
 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
-	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	const struct ccid3_hc_rx_sock *hcrx;
 	__be32 x_recv, pinv;
 
 	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
 		return 0;
 
+	hcrx = ccid3_hc_rx_sk(sk);
+
 	if (dccp_packet_without_ack(skb))
 		return 0;
 
-	x_recv = htonl(hcrx->x_recv);
-	pinv   = htonl(hcrx->p_inverse);
+	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
+	pinv   = htonl(hcrx->ccid3hcrx_pinv);
 
 	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
 			       &pinv, sizeof(pinv)) ||
@@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 static u32 ccid3_first_li(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
-	    rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
+	u32 x_recv, p, delta;
 	u64 fval;
 
-	/*
-	 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p
-	 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p
-	 * is about 20.64%. This yields an interval length of 4.84 (rounded up).
-	 */
-	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
-		return 5;
+	if (hcrx->ccid3hcrx_rtt == 0) {
+		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
+		hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
+	}
 
-	x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
-	if (x_recv == 0)
-		goto failed;
+	delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
+	x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
+	if (x_recv == 0) {		/* would also trigger divide-by-zero */
+		DCCP_WARN("X_recv==0\n");
+		if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
+			DCCP_BUG("stored value of X_recv is zero");
+			return ~0U;
+		}
+	}
 
-	fval = scaled_div32(scaled_div(s, rtt), x_recv);
+	fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
+	fval = scaled_div32(fval, x_recv);
 	p = tfrc_calc_x_reverse_lookup(fval);
 
 	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
 		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
 
-	if (p > 0)
-		return scaled_div(1, p);
-failed:
-	return UINT_MAX;
+	return p == 0 ? ~0U : scaled_div(1, p);
 }
 
 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
 	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
 	const bool is_data_packet = dccp_data_packet(skb);
 
+	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
+		if (is_data_packet) {
+			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+			do_feedback = CCID3_FBACK_INITIAL;
+			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+			hcrx->ccid3hcrx_s = payload;
+			/*
+			 * Not necessary to update ccid3hcrx_bytes_recv here,
+			 * since X_recv = 0 for the first feedback packet (cf.
+			 * RFC 3448, 6.3) -- gerrit
+			 */
+		}
+		goto update_records;
+	}
+
+	if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
+		return; /* done receiving */
+
+	if (is_data_packet) {
+		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+		/*
+		 * Update moving-average of s and the sum of received payload bytes
+		 */
+		hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
+		hcrx->ccid3hcrx_bytes_recv += payload;
+	}
+
 	/*
 	 * Perform loss detection and handle pending losses
 	 */
-	if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist,
-				     skb, ndp, ccid3_first_li, sk))
-		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE);
+	if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist,
+				skb, ndp, ccid3_first_li, sk)) {
+		do_feedback = CCID3_FBACK_PARAM_CHANGE;
+		goto done_receiving;
+	}
+
+	if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
+		return; /* done receiving */
+
 	/*
-	 * Feedback for first non-empty data packet (RFC 3448, 6.3)
+	 * Handle data packets: RTT sampling and monitoring p
 	 */
-	else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet))
-		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL);
+	if (unlikely(!is_data_packet))
+		goto update_records;
+
+	if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
+		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
+		/*
+		 * Empty loss history: no loss so far, hence p stays 0.
+		 * Sample RTT values, since an RTT estimate is required for the
+		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
+		 */
+		if (sample != 0)
+			hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
+
+	} else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
+		/*
+		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
+		 * has decreased (resp. p has increased), send feedback now.
+		 */
+		do_feedback = CCID3_FBACK_PARAM_CHANGE;
+	}
+
 	/*
 	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
 	 */
-	else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet &&
-		 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
-		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC);
+	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
+		do_feedback = CCID3_FBACK_PERIODIC;
+
+update_records:
+	tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
+
+done_receiving:
+	if (do_feedback)
+		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
 }
 
 static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
 
-	tfrc_lh_init(&hcrx->li_hist);
-	return tfrc_rx_hist_init(&hcrx->hist, sk);
+	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
+	tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
+	return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
 }
 
 static void ccid3_hc_rx_exit(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 
-	tfrc_rx_hist_purge(&hcrx->hist);
-	tfrc_lh_cleanup(&hcrx->li_hist);
+	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
+
+	tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
+	tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
 }
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
+	const struct ccid3_hc_rx_sock *hcrx;
+
+	/* Listen socks doesn't have a private CCID block */
+	if (sk->sk_state == DCCP_LISTEN)
+		return;
+
+	hcrx = ccid3_hc_rx_sk(sk);
+	info->tcpi_ca_state = hcrx->ccid3hcrx_state;
 	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
-	info->tcpi_rcv_rtt  = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist);
+	info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
 }
 
 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
-	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	const struct ccid3_hc_rx_sock *hcrx;
 	struct tfrc_rx_info rx_info;
 	const void *val;
 
+	/* Listen socks doesn't have a private CCID block */
+	if (sk->sk_state == DCCP_LISTEN)
+		return -EINVAL;
+
+	hcrx = ccid3_hc_rx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_RX_INFO:
 		if (len < sizeof(rx_info))
 			return -EINVAL;
-		rx_info.tfrcrx_x_recv = hcrx->x_recv;
-		rx_info.tfrcrx_rtt    = tfrc_rx_hist_rtt(&hcrx->hist);
-		rx_info.tfrcrx_p      = tfrc_invert_loss_event_rate(hcrx->p_inverse);
+		rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
+		rx_info.tfrcrx_rtt    = hcrx->ccid3hcrx_rtt;
+		rx_info.tfrcrx_p      = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
+					   scaled_div(1, hcrx->ccid3hcrx_pinv);
 		len = sizeof(rx_info);
 		val = &rx_info;
 		break;
@@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = {
 	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
 };
 
-module_param(do_osc_prev, bool, 0644);
-MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
-
 #ifdef CONFIG_IP_DCCP_CCID3_DEBUG
 module_param(ccid3_debug, bool, 0644);
 MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
@@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
 
 static __init int ccid3_module_init(void)
 {
-	struct timespec tp;
-
-	/*
-	 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
-	 * correctly and feedback is sent either too early or too late.
-	 */
-	hrtimer_get_res(CLOCK_MONOTONIC, &tp);
-	if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
-		printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
-		       " resolution - check your clocksource.\n", __func__,
-		       tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
-		return -ESOCKTNOSUPPORT;
-	}
 	return ccid_register(&ccid3);
 }
 module_init(ccid3_module_init);
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index af6e1bf937d..49ca32bd7e7 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,22 +47,11 @@
 /* Two seconds as per RFC 3448 4.2 */
 #define TFRC_INITIAL_TIMEOUT	   (2 * USEC_PER_SEC)
 
-/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */
-#define TFRC_T_MBI		   (64 * USEC_PER_SEC)
+/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
+#define TFRC_OPSYS_HALF_TIME_GRAN  (USEC_PER_SEC / (2 * HZ))
 
-/*
- * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are
- * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
- * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
- * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
- * when using the constant t_delta  =  t_gran / 2  =  %USEC_PER_SEC / (2 * HZ).
- */
-#if (HZ >= 500)
-# define TFRC_T_DELTA		   USEC_PER_MSEC
-#else
-# define TFRC_T_DELTA		   (USEC_PER_SEC / (2 * HZ))
-#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
-#endif
+/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
+#define TFRC_T_MBI		   64
 
 enum ccid3_options {
 	TFRC_OPT_LOSS_EVENT_RATE = 192,
@@ -70,43 +59,62 @@ enum ccid3_options {
 	TFRC_OPT_RECEIVE_RATE	 = 194,
 };
 
+struct ccid3_options_received {
+	u64 ccid3or_seqno:48,
+	    ccid3or_loss_intervals_idx:16;
+	u16 ccid3or_loss_intervals_len;
+	u32 ccid3or_loss_event_rate;
+	u32 ccid3or_receive_rate;
+};
+
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+	TFRC_SSTATE_NO_SENT = 1,
+	TFRC_SSTATE_NO_FBACK,
+	TFRC_SSTATE_FBACK,
+	TFRC_SSTATE_TERM,
+};
+
 /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
  *
- * @x - Current sending rate in 64 * bytes per second
- * @x_recv - Receive rate    in 64 * bytes per second
- * @x_calc - Calculated rate in bytes per second
- * @rtt - Estimate of current round trip time in usecs
- * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5)
- * @p - Current loss event rate (0-1) scaled by 1000000
- * @s - Packet size in bytes
- * @t_rto - Nofeedback Timer setting in usecs
- * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
- * @feedback - Whether feedback has been received or not
- * @last_win_count - Last window counter sent
- * @t_last_win_count - Timestamp of earliest packet with
- *                     last_win_count value sent
- * @no_feedback_timer - Handle to no feedback timer
- * @t_ld - Time last doubled during slow start
- * @t_nom - Nominal send time of next packet
- * @hist - Packet history
+ * @ccid3hctx_x - Current sending rate in 64 * bytes per second
+ * @ccid3hctx_x_recv - Receive rate    in 64 * bytes per second
+ * @ccid3hctx_x_calc - Calculated rate in bytes per second
+ * @ccid3hctx_rtt - Estimate of current round trip time in usecs
+ * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
+ * @ccid3hctx_s - Packet size in bytes
+ * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs
+ * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
+ * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
+ * @ccid3hctx_last_win_count - Last window counter sent
+ * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
+ *				 with last_win_count value sent
+ * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
+ * @ccid3hctx_t_ld - Time last doubled during slow start
+ * @ccid3hctx_t_nom - Nominal send time of next packet
+ * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
+ * @ccid3hctx_hist - Packet history
+ * @ccid3hctx_options_received - Parsed set of retrieved options
  */
 struct ccid3_hc_tx_sock {
-	u64				x;
-	u64				x_recv;
-	u32				x_calc;
-	u32				rtt;
-	u16				r_sqmean;
-	u32				p;
-	u32				t_rto;
-	u32				t_ipi;
-	u16				s;
-	bool				feedback:1;
-	u8				last_win_count;
-	ktime_t				t_last_win_count;
-	struct timer_list		no_feedback_timer;
-	ktime_t				t_ld;
-	ktime_t				t_nom;
-	struct tfrc_tx_hist_entry	*hist;
+	struct tfrc_tx_info		ccid3hctx_tfrc;
+#define ccid3hctx_x			ccid3hctx_tfrc.tfrctx_x
+#define ccid3hctx_x_recv		ccid3hctx_tfrc.tfrctx_x_recv
+#define ccid3hctx_x_calc		ccid3hctx_tfrc.tfrctx_x_calc
+#define ccid3hctx_rtt			ccid3hctx_tfrc.tfrctx_rtt
+#define ccid3hctx_p			ccid3hctx_tfrc.tfrctx_p
+#define ccid3hctx_t_rto			ccid3hctx_tfrc.tfrctx_rto
+#define ccid3hctx_t_ipi			ccid3hctx_tfrc.tfrctx_ipi
+	u16				ccid3hctx_s;
+	enum ccid3_hc_tx_states		ccid3hctx_state:8;
+	u8				ccid3hctx_last_win_count;
+	ktime_t				ccid3hctx_t_last_win_count;
+	struct timer_list		ccid3hctx_no_feedback_timer;
+	ktime_t				ccid3hctx_t_ld;
+	ktime_t				ccid3hctx_t_nom;
+	u32				ccid3hctx_delta;
+	struct tfrc_tx_hist_entry	*ccid3hctx_hist;
+	struct ccid3_options_received	ccid3hctx_options_received;
 };
 
 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
     return hctx;
 }
 
-
-enum ccid3_fback_type {
-	CCID3_FBACK_NONE = 0,
-	CCID3_FBACK_INITIAL,
-	CCID3_FBACK_PERIODIC,
-	CCID3_FBACK_PARAM_CHANGE
+/* TFRC receiver states */
+enum ccid3_hc_rx_states {
+	TFRC_RSTATE_NO_DATA = 1,
+	TFRC_RSTATE_DATA,
+	TFRC_RSTATE_TERM    = 127,
 };
 
 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
  *
- *  @last_counter  -  Tracks window counter (RFC 4342, 8.1)
- *  @feedback  -  The type of the feedback last sent
- *  @x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
- *  @tstamp_last_feedback  -  Time at which last feedback was sent
- *  @hist  -  Packet history (loss detection + RTT sampling)
- *  @li_hist  -  Loss Interval database
- *  @p_inverse  -  Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
+ *  @ccid3hcrx_x_recv  -  Receiver estimate of send rate (RFC 3448 4.3)
+ *  @ccid3hcrx_rtt  -  Receiver estimate of rtt (non-standard)
+ *  @ccid3hcrx_p  -  Current loss event rate (RFC 3448 5.4)
+ *  @ccid3hcrx_last_counter  -  Tracks window counter (RFC 4342, 8.1)
+ *  @ccid3hcrx_state  -  Receiver state, one of %ccid3_hc_rx_states
+ *  @ccid3hcrx_bytes_recv  -  Total sum of DCCP payload bytes
+ *  @ccid3hcrx_x_recv  -  Receiver estimate of send rate (RFC 3448, sec. 4.3)
+ *  @ccid3hcrx_rtt  -  Receiver estimate of RTT
+ *  @ccid3hcrx_tstamp_last_feedback  -  Time at which last feedback was sent
+ *  @ccid3hcrx_tstamp_last_ack  -  Time at which last feedback was sent
+ *  @ccid3hcrx_hist  -  Packet history (loss detection + RTT sampling)
+ *  @ccid3hcrx_li_hist  -  Loss Interval database
+ *  @ccid3hcrx_s  -  Received packet size in bytes
+ *  @ccid3hcrx_pinv  -  Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
  */
 struct ccid3_hc_rx_sock {
-	u8				last_counter:4;
-	enum ccid3_fback_type		feedback:4;
-	u32				x_recv;
-	ktime_t				tstamp_last_feedback;
-	struct tfrc_rx_hist		hist;
-	struct tfrc_loss_hist		li_hist;
-#define p_inverse			li_hist.i_mean
+	u8				ccid3hcrx_last_counter:4;
+	enum ccid3_hc_rx_states		ccid3hcrx_state:8;
+	u32				ccid3hcrx_bytes_recv;
+	u32				ccid3hcrx_x_recv;
+	u32				ccid3hcrx_rtt;
+	ktime_t				ccid3hcrx_tstamp_last_feedback;
+	struct tfrc_rx_hist		ccid3hcrx_hist;
+	struct tfrc_loss_hist		ccid3hcrx_li_hist;
+	u16				ccid3hcrx_s;
+#define ccid3hcrx_pinv			ccid3hcrx_li_hist.i_mean
 };
 
 static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index b1ae8f8259e..5b3ce0688c5 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
 
 /**
  * tfrc_lh_update_i_mean  -  Update the `open' loss interval I_0
- * This updates I_mean as the sequence numbers increase. As a consequence, the
- * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
- * decreases, and thus there is no need to send renewed feedback.
+ * For recomputing p: returns `true' if p > p_prev  <=>  1/p < 1/p_prev
  */
-void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
 {
 	struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
+	u32 old_i_mean = lh->i_mean;
 	s64 len;
 
 	if (cur == NULL)			/* not initialised */
-		return;
-
-	/* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
-	if (!dccp_data_packet(skb))
-		return;
+		return 0;
 
 	len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
 
 	if (len - (s64)cur->li_length <= 0)	/* duplicate or reordered */
-		return;
+		return 0;
 
 	if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
 		/*
@@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
 		cur->li_is_closed = 1;
 
 	if (tfrc_lh_length(lh) == 1)		/* due to RFC 3448, 6.3.1 */
-		return;
+		return 0;
 
 	cur->li_length = len;
 	tfrc_lh_calc_i_mean(lh);
+
+	return (lh->i_mean < old_i_mean);
 }
+EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
 
 /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
 static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
@@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
  * @sk:		   Used by @calc_first_li in caller-specific way (subtyping)
  * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
  */
-bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
-			  u32 (*calc_first_li)(struct sock *), struct sock *sk)
+int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
+			 u32 (*calc_first_li)(struct sock *), struct sock *sk)
 {
 	struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
 
 	if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
-		return false;
+		return 0;
 
 	new = tfrc_lh_demand_next(lh);
 	if (unlikely(new == NULL)) {
 		DCCP_CRIT("Cannot allocate/add loss record.");
-		return false;
+		return 0;
 	}
 
 	new->li_seqno	  = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
@@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
 
 		tfrc_lh_calc_i_mean(lh);
 	}
-	return true;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
 
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index d08a226db43..246018a3b26 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
 
 struct tfrc_rx_hist;
 
-extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+extern int  tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
 				 u32 (*first_li)(struct sock *), struct sock *);
-extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+extern u8   tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
 extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
 
 #endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index cce9f03bda3..6cc108afdc3 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -40,6 +40,18 @@
 #include "packet_history.h"
 #include "../../dccp.h"
 
+/**
+ *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
+ *  @next:  next oldest entry (LIFO order)
+ *  @seqno: sequence number of this entry
+ *  @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+	struct tfrc_tx_hist_entry *next;
+	u64			  seqno;
+	ktime_t			  stamp;
+};
+
 /*
  * Transmitter History Routines
  */
@@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void)
 	}
 }
 
+static struct tfrc_tx_hist_entry *
+	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+	while (head != NULL && head->seqno != seqno)
+		head = head->next;
+
+	return head;
+}
+
 int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
 {
 	struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
 }
 EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
 
+u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
+		     const ktime_t now)
+{
+	u32 rtt = 0;
+	struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
+
+	if (packet != NULL) {
+		rtt = ktime_us_delta(now, packet->stamp);
+		/*
+		 * Garbage-collect older (irrelevant) entries:
+		 */
+		tfrc_tx_hist_purge(&packet->next);
+	}
+
+	return rtt;
+}
+EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
+
+
 /*
  * 	Receiver History Routines
  */
@@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
 
-
-static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
-{
-	struct tfrc_rx_hist_entry *tmp = h->ring[a];
-
-	h->ring[a] = h->ring[b];
-	h->ring[b] = tmp;
-}
-
 static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
 {
-	__tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a),
-			       tfrc_rx_hist_index(h, b));
-}
+	const u8 idx_a = tfrc_rx_hist_index(h, a),
+		 idx_b = tfrc_rx_hist_index(h, b);
+	struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
 
-/**
- * tfrc_rx_hist_resume_rtt_sampling  -  Prepare RX history for RTT sampling
- * This is called after loss detection has finished, when the history entry
- * with the index of `loss_count' holds the highest-received sequence number.
- * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt).
- */
-static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h)
-{
-	__tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count));
-	h->loss_count = h->loss_start = 0;
+	h->ring[idx_a] = h->ring[idx_b];
+	h->ring[idx_b] = tmp;
 }
 
 /*
@@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
 	u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
 	    s1 = DCCP_SKB_CB(skb)->dccpd_seq;
 
-	if (!dccp_loss_free(s0, s1, n1))	/* gap between S0 and S1 */
+	if (!dccp_loss_free(s0, s1, n1)) {	/* gap between S0 and S1 */
 		h->loss_count = 1;
+		tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
+	}
 }
 
 static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
@@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
 
 		if (dccp_loss_free(s2, s1, n1)) {
 			/* hole is filled: S0, S2, and S1 are consecutive */
-			tfrc_rx_hist_resume_rtt_sampling(h);
+			h->loss_count = 0;
+			h->loss_start = tfrc_rx_hist_index(h, 1);
 		} else
 			/* gap between S2 and S1: just update loss_prev */
 			tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
@@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
 
 			if (dccp_loss_free(s1, s2, n2)) {
 				/* entire hole filled by S0, S3, S1, S2 */
-				tfrc_rx_hist_resume_rtt_sampling(h);
+				h->loss_start = tfrc_rx_hist_index(h, 2);
+				h->loss_count = 0;
 			} else {
 				/* gap remains between S1 and S2 */
 				h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
 
 		if (dccp_loss_free(s2, s3, n3)) {
 			/* no gap between S2 and S3: entire hole is filled */
-			tfrc_rx_hist_resume_rtt_sampling(h);
+			h->loss_start = tfrc_rx_hist_index(h, 3);
+			h->loss_count = 0;
 		} else {
 			/* gap between S2 and S3 */
 			h->loss_start = tfrc_rx_hist_index(h, 2);
@@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
 }
 
 /**
- *  tfrc_rx_congestion_event  -  Loss detection and further processing
- *  @h:		The non-empty RX history object
- *  @lh:	Loss Intervals database to update
- *  @skb:	Currently received packet
- *  @ndp:	The NDP count belonging to @skb
- *  @first_li:	Caller-dependent computation of first loss interval in @lh
- *  @sk:	Used by @calc_first_li (see tfrc_lh_interval_add)
+ *  tfrc_rx_handle_loss  -  Loss detection and further processing
+ *  @h:		    The non-empty RX history object
+ *  @lh:	    Loss Intervals database to update
+ *  @skb:	    Currently received packet
+ *  @ndp:	    The NDP count belonging to @skb
+ *  @calc_first_li: Caller-dependent computation of first loss interval in @lh
+ *  @sk:	    Used by @calc_first_li (see tfrc_lh_interval_add)
  *  Chooses action according to pending loss, updates LI database when a new
  *  loss was detected, and does required post-processing. Returns 1 when caller
  *  should send feedback, 0 otherwise.
@@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
  *  records accordingly, the caller should not perform any more RX history
  *  operations when loss_count is greater than 0 after calling this function.
  */
-bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
-			      struct tfrc_loss_hist *lh,
-			      struct sk_buff *skb, const u64 ndp,
-			      u32 (*first_li)(struct sock *), struct sock *sk)
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+			struct tfrc_loss_hist *lh,
+			struct sk_buff *skb, const u64 ndp,
+			u32 (*calc_first_li)(struct sock *), struct sock *sk)
 {
-	bool new_event = false;
-
-	if (tfrc_rx_hist_duplicate(h, skb))
-		return 0;
+	int is_new_loss = 0;
 
 	if (h->loss_count == 0) {
 		__do_track_loss(h, skb, ndp);
-		tfrc_rx_hist_sample_rtt(h, skb);
-		tfrc_rx_hist_add_packet(h, skb, ndp);
 	} else if (h->loss_count == 1) {
 		__one_after_loss(h, skb, ndp);
 	} else if (h->loss_count != 2) {
@@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
 		/*
 		 * Update Loss Interval database and recycle RX records
 		 */
-		new_event = tfrc_lh_interval_add(lh, h, first_li, sk);
+		is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
 		__three_after_loss(h);
 	}
-
-	/*
-	 * Update moving-average of `s' and the sum of received payload bytes.
-	 */
-	if (dccp_data_packet(skb)) {
-		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
-
-		h->packet_size = tfrc_ewma(h->packet_size, payload, 9);
-		h->bytes_recvd += payload;
-	}
-
-	/* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
-	if (!new_event)
-		tfrc_lh_update_i_mean(lh, skb);
-
-	return new_event;
+	return is_new_loss;
 }
-EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event);
+EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
 
-/* Compute the sending rate X_recv measured between feedback intervals */
-u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
 {
-	u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate;
-	s64 delta = ktime_to_us(net_timedelta(h->bytes_start));
-
-	WARN_ON(delta <= 0);
-	/*
-	 * Ensure that the sampling interval for X_recv is at least one RTT,
-	 * by extending the sampling interval backwards in time, over the last
-	 * R_(m-1) seconds, as per rfc3448bis-06, 6.2.
-	 * To reduce noise (e.g. when the RTT changes often), this is only
-	 * done when delta is smaller than RTT/2.
-	 */
-	if (last_x_recv > 0 && delta < last_rtt/2) {
-		tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n",
-			      (long)delta, (unsigned)last_rtt);
+	int i;
 
-		delta = (bytes ? delta : 0) + last_rtt;
-		bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC);
+	for (i = 0; i <= TFRC_NDUPACK; i++) {
+		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
+		if (h->ring[i] == NULL)
+			goto out_free;
 	}
 
-	if (unlikely(bytes == 0)) {
-		DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv);
-		return last_x_recv;
+	h->loss_count = h->loss_start = 0;
+	return 0;
+
+out_free:
+	while (i-- != 0) {
+		kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
+		h->ring[i] = NULL;
 	}
-	return scaled_div32(bytes, delta);
+	return -ENOBUFS;
 }
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv);
+EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc);
 
 void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 {
@@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
 
-static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
+/**
+ * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
 {
-	int i;
-
-	memset(h, 0, sizeof(*h));
-
-	for (i = 0; i <= TFRC_NDUPACK; i++) {
-		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
-		if (h->ring[i] == NULL) {
-			tfrc_rx_hist_purge(h);
-			return -ENOBUFS;
-		}
-	}
-	return 0;
+	return h->ring[0];
 }
 
-int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk)
+/**
+ * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
 {
-	if (tfrc_rx_hist_alloc(h))
-		return -ENOBUFS;
-	/*
-	 * Initialise first entry with GSR to start loss detection as early as
-	 * possible. Code using this must not use any other fields. The entry
-	 * will be overwritten once the CCID updates its received packets.
-	 */
-	tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr;
-	return 0;
+	return h->ring[h->rtt_sample_prev];
 }
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
 
 /**
  * tfrc_rx_hist_sample_rtt  -  Sample RTT from timestamp / CCVal
- * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss
- * is pending and uses the following history entries (via rtt_sample_prev):
- * - h->ring[0]  contains the most recent history entry prior to @skb;
- * - h->ring[1]  is an unused `dummy' entry when the current difference is 0;
+ * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
+ * to compute a sample with given data - calling function should check this.
  */
-void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
 {
-	struct tfrc_rx_hist_entry *last = h->ring[0];
-	u32 sample, delta_v;
-
-	/*
-	 * When not to sample:
-	 * - on non-data packets
-	 *   (RFC 4342, 8.1: CCVal only fully defined for data packets);
-	 * - when no data packets have been received yet
-	 *   (FIXME: using sampled packet size as indicator here);
-	 * - as long as there are gaps in the sequence space (pending loss).
-	 */
-	if (!dccp_data_packet(skb) || h->packet_size == 0 ||
-	    tfrc_rx_hist_loss_pending(h))
-		return;
+	u32 sample = 0,
+	    delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
+			    tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+
+	if (delta_v < 1 || delta_v > 4) {	/* unsuitable CCVal delta */
+		if (h->rtt_sample_prev == 2) {	/* previous candidate stored */
+			sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+				       tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+			if (sample)
+				sample = 4 / sample *
+				         ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
+							tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
+			else    /*
+				 * FIXME: This condition is in principle not
+				 * possible but occurs when CCID is used for
+				 * two-way data traffic. I have tried to trace
+				 * it, but the cause does not seem to be here.
+				 */
+				DCCP_BUG("please report to dccp@vger.kernel.org"
+					 " => prev = %u, last = %u",
+					 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+					 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+		} else if (delta_v < 1) {
+			h->rtt_sample_prev = 1;
+			goto keep_ref_for_next_time;
+		}
 
-	h->rtt_sample_prev = 0;		/* reset previous candidate */
+	} else if (delta_v == 4) /* optimal match */
+		sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
+	else {			 /* suboptimal match */
+		h->rtt_sample_prev = 2;
+		goto keep_ref_for_next_time;
+	}
 
-	delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval);
-	if (delta_v == 0) {		/* less than RTT/4 difference */
-		h->rtt_sample_prev = 1;
-		return;
+	if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
+		DCCP_WARN("RTT sample %u too large, using max\n", sample);
+		sample = DCCP_SANE_RTT_MAX;
 	}
-	sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp)));
 
-	if (delta_v <= 4)		/* between RTT/4 and RTT */
-		sample *= 4 / delta_v;
-	else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2))
-		/*
-		* Optimisation: CCVal difference is greater than 1 RTT, yet the
-		* sample is less than the local RTT estimate; which means that
-		* the RTT estimate is too high.
-		* To avoid noise, it is not done if the sample is below RTT/2.
-		*/
-		return;
+	h->rtt_sample_prev = 0;	       /* use current entry as next reference */
+keep_ref_for_next_time:
 
-	/* Use a lower weight than usual to increase responsiveness */
-	h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5);
+	return sample;
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 555e65cd73a..461cc91cce8 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,28 +40,12 @@
 #include <linux/slab.h>
 #include "tfrc.h"
 
-/**
- *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
- *  @next:  next oldest entry (LIFO order)
- *  @seqno: sequence number of this entry
- *  @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
-	struct tfrc_tx_hist_entry *next;
-	u64			  seqno;
-	ktime_t			  stamp;
-};
-
-static inline struct tfrc_tx_hist_entry *
-	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
-	while (head != NULL && head->seqno != seqno)
-		head = head->next;
-	return head;
-}
+struct tfrc_tx_hist_entry;
 
 extern int  tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
 extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
+extern u32  tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
+			     const u64 seqno, const ktime_t now);
 
 /* Subtraction a-b modulo-16, respects circular wrap-around */
 #define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
@@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry {
  * @loss_count:		Number of entries in circular history
  * @loss_start:		Movable index (for loss detection)
  * @rtt_sample_prev:	Used during RTT sampling, points to candidate entry
- * @rtt_estimate:	Receiver RTT estimate
- * @packet_size:	Packet size in bytes (as per RFC 3448, 3.1)
- * @bytes_recvd:	Number of bytes received since @bytes_start
- * @bytes_start:	Start time for counting @bytes_recvd
  */
 struct tfrc_rx_hist {
 	struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
 	u8			  loss_count:2,
 				  loss_start:2;
-	/* Receiver RTT sampling */
 #define rtt_sample_prev		  loss_start
-	u32			  rtt_estimate;
-	/* Receiver sampling of application payload lengths */
-	u32			  packet_size,
-				  bytes_recvd;
-	ktime_t			  bytes_start;
 };
 
 /**
@@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
 	return h->loss_count > 0;
 }
 
-/*
- * Accessor functions to retrieve parameters sampled by the RX history
- */
-static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
-{
-	if (h->packet_size == 0) {
-		DCCP_WARN("No sample for s, using fallback\n");
-		return TCP_MIN_RCVMSS;
-	}
-	return h->packet_size;
-
-}
-static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
-{
-	if (h->rtt_estimate == 0) {
-		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
-		return  DCCP_FALLBACK_RTT;
-	}
-	return h->rtt_estimate;
-}
-
-static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
-{
-	h->bytes_recvd = 0;
-	h->bytes_start = ktime_get_real();
-}
-
-extern u32  tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
-
-
 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
 				    const struct sk_buff *skb, const u64 ndp);
 
 extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
 
 struct tfrc_loss_hist;
-extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
-				     struct tfrc_loss_hist *lh,
-				     struct sk_buff *skb, const u64 ndp,
-				     u32 (*first_li)(struct sock *sk),
-				     struct sock *sk);
-extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
-				    const struct sk_buff *skb);
-extern int  tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
+extern int  tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+				struct tfrc_loss_hist *lh,
+				struct sk_buff *skb, const u64 ndp,
+				u32 (*first_li)(struct sock *sk),
+				struct sock *sk);
+extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
+				   const struct sk_buff *skb);
+extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
 extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
 
 #endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ede12f53de5..ed9857527ac 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -47,21 +47,6 @@ static inline u32 scaled_div32(u64 a, u64 b)
 	return result;
 }
 
-/**
- * tfrc_scaled_sqrt  -  Compute scaled integer sqrt(x) for 0 < x < 2^22-1
- * Uses scaling to improve accuracy of the integer approximation of sqrt(). The
- * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
- * clamped RTT samples (dccp_sample_rtt).
- * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the
- * scaling factor is neutralised. For this purpose, it avoids returning zero.
- */
-static inline u16 tfrc_scaled_sqrt(const u32 sample)
-{
-	const unsigned long non_zero_sample = sample ? : 1;
-
-	return int_sqrt(non_zero_sample << 10);
-}
-
 /**
  * tfrc_ewma  -  Exponentially weighted moving average
  * @weight: Weight to be used as damping factor, in units of 1/10
@@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
 
 extern u32  tfrc_calc_x(u16 s, u32 R, u32 p);
 extern u32  tfrc_calc_x_reverse_lookup(u32 fvalue);
-extern u32  tfrc_invert_loss_event_rate(u32 loss_event_rate);
 
 extern int  tfrc_tx_packet_history_init(void);
 extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 38239c4d5e1..2f20a29cffe 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 
 	if (p <= TFRC_CALC_X_SPLIT) 		{     /* 0.0000 < p <= 0.05   */
 		if (p < TFRC_SMALLEST_P) {	      /* 0.0000 < p <  0.0001 */
-			/*
-			 * In the congestion-avoidance phase p decays towards 0
-			 * when there are no further losses, so this case is
-			 * natural. Truncating to p_min = 0.01% means that the
-			 * maximum achievable throughput is limited to about
-			 * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
-			 * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
-			 */
-			tfrc_pr_debug("Value of p (%d) below resolution. "
-				      "Substituting %d\n", p, TFRC_SMALLEST_P);
+			DCCP_WARN("Value of p (%d) below resolution. "
+				  "Substituting %d\n", p, TFRC_SMALLEST_P);
 			index = 0;
 		} else 				      /* 0.0001 <= p <= 0.05  */
 			index =  p/TFRC_SMALLEST_P - 1;
@@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 	result = scaled_div(s, R);
 	return scaled_div32(result, f);
 }
+
 EXPORT_SYMBOL_GPL(tfrc_calc_x);
 
 /**
@@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 	index = tfrc_binsearch(fvalue, 0);
 	return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
 }
-EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
 
-/**
- * tfrc_invert_loss_event_rate  -  Compute p so that 10^6 corresponds to 100%
- * When @loss_event_rate is large, there is a chance that p is truncated to 0.
- * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
- */
-u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
-{
-	if (loss_event_rate == UINT_MAX)		/* see RFC 4342, 8.5 */
-		return 0;
-	if (unlikely(loss_event_rate == 0))		/* map 1/0 into 100% */
-		return 1000000;
-	return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
-}
-EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
+EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 5281190aa19..b4bc6e095a0 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -42,11 +42,9 @@
 extern int dccp_debug;
 #define dccp_pr_debug(format, a...)	  DCCP_PR_DEBUG(dccp_debug, format, ##a)
 #define dccp_pr_debug_cat(format, a...)   DCCP_PRINTK(dccp_debug, format, ##a)
-#define dccp_debug(fmt, a...)		  dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
 #else
 #define dccp_pr_debug(format, a...)
 #define dccp_pr_debug_cat(format, a...)
-#define dccp_debug(format, a...)
 #endif
 
 extern struct inet_hashinfo dccp_hashinfo;
@@ -63,14 +61,11 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
  *    - DCCP-Reset    with ACK Subheader and 4 bytes of Reset Code fields
  *  Hence a safe upper bound for the maximum option length is 1020-28 = 992
  */
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
 #define DCCP_MAX_PACKET_HDR 28
 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
 #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
 
-/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
-#define DCCP_FEATNEG_OVERHEAD	 (32 * sizeof(uint32_t))
-
 #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
 				     * state, about 60 seconds */
 
@@ -86,13 +81,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define DCCP_RTO_MAX ((unsigned)(64 * HZ))
 
-/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */
-#define DCCP_TIME_RESOLUTION	10
-
 /*
  * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
  */
-#define DCCP_SANE_RTT_MIN	(10 * DCCP_TIME_RESOLUTION)
+#define DCCP_SANE_RTT_MIN	100
 #define DCCP_FALLBACK_RTT	(USEC_PER_SEC / 5)
 #define DCCP_SANE_RTT_MAX	(3 * USEC_PER_SEC)
 
@@ -103,6 +95,12 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
 extern int  sysctl_dccp_request_retries;
 extern int  sysctl_dccp_retries1;
 extern int  sysctl_dccp_retries2;
+extern int  sysctl_dccp_feat_sequence_window;
+extern int  sysctl_dccp_feat_rx_ccid;
+extern int  sysctl_dccp_feat_tx_ccid;
+extern int  sysctl_dccp_feat_ack_ratio;
+extern int  sysctl_dccp_feat_send_ack_vector;
+extern int  sysctl_dccp_feat_send_ndp_count;
 extern int  sysctl_dccp_tx_qlen;
 extern int  sysctl_dccp_sync_ratelimit;
 
@@ -237,22 +235,8 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
 			   const enum dccp_pkt_type pkt_type);
 
-/*
- * TX Packet Dequeueing Interface
- */
-extern void		dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
-extern bool		dccp_qpolicy_full(struct sock *sk);
-extern void		dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
-extern struct sk_buff	*dccp_qpolicy_top(struct sock *sk);
-extern struct sk_buff	*dccp_qpolicy_pop(struct sock *sk);
-extern bool		dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
-
-/*
- * TX Packet Output and TX Timers
- */
-extern void dccp_write_xmit(struct sock *sk);
+extern void dccp_write_xmit(struct sock *sk, int block);
 extern void dccp_write_space(struct sock *sk);
-extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
 
 extern void dccp_init_xmit_timers(struct sock *sk);
 static inline void dccp_clear_xmit_timers(struct sock *sk)
@@ -268,8 +252,7 @@ extern const char *dccp_state_name(const int state);
 extern void dccp_set_state(struct sock *sk, const int state);
 extern void dccp_done(struct sock *sk);
 
-extern int  dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
-			    struct sk_buff const *skb);
+extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
 
 extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 
@@ -334,14 +317,7 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
 extern int	   dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
 extern void	   dccp_send_close(struct sock *sk, const int active);
 extern int	   dccp_invalid_packet(struct sk_buff *skb);
-
-static inline u32  dccp_sane_rtt(long usec_sample)
-{
-	if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX))
-		DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample);
-	return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX);
-}
-extern u32 dccp_sample_rtt(struct sock *sk, long delta);
+extern u32	   dccp_sample_rtt(struct sock *sk, long delta);
 
 static inline int dccp_bad_service_code(const struct sock *sk,
 					const __be32 service)
@@ -435,62 +411,36 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
 static inline void dccp_update_gsr(struct sock *sk, u64 seq)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	const struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	dp->dccps_gsr = seq;
-	/* Sequence validity window depends on remote Sequence Window (7.5.1) */
-	dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
-	/*
-	 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
-	 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
-	 * always > 32, so for the first W/W' packets in the lifetime of a
-	 * connection we always have to adjust SWL.
-	 * A second reason why we are doing this is that the window depends on
-	 * the feature-remote value of Sequence Window: nothing stops the peer
-	 * from updating this value while we are busy adjusting SWL for the
-	 * first W packets (we would have to count from scratch again then).
-	 * Therefore it is safer to always make sure that the Sequence Window
-	 * is not artificially extended by a peer who grows SWL downwards by
-	 * continually updating the feature-remote Sequence-Window.
-	 * If sequence numbers wrap it is bad luck. But that will take a while
-	 * (48 bit), and this measure prevents Sequence-number attacks.
-	 */
-	if (before48(dp->dccps_swl, dp->dccps_isr))
-		dp->dccps_swl = dp->dccps_isr;
-	dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
+	dccp_set_seqno(&dp->dccps_swl,
+		       dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
+	dccp_set_seqno(&dp->dccps_swh,
+		       dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
 }
 
 static inline void dccp_update_gss(struct sock *sk, u64 seq)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	dp->dccps_gss = seq;
-	/* Ack validity window depends on local Sequence Window value (7.5.1) */
-	dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
-	/* Adjust AWL so that it is not below ISS - see comment above for SWL */
-	if (before48(dp->dccps_awl, dp->dccps_iss))
-		dp->dccps_awl = dp->dccps_iss;
-	dp->dccps_awh = dp->dccps_gss;
-}
-
-static inline int dccp_ackvec_pending(const struct sock *sk)
-{
-	return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
-	       !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
+	dp->dccps_awh = dp->dccps_gss = seq;
+	dccp_set_seqno(&dp->dccps_awl,
+		       (dp->dccps_gss -
+			dccp_msk(sk)->dccpms_sequence_window + 1));
 }
 
 static inline int dccp_ack_pending(const struct sock *sk)
 {
-	return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
+	const struct dccp_sock *dp = dccp_sk(sk);
+	return dp->dccps_timestamp_echo != 0 ||
+#ifdef CONFIG_IP_DCCP_ACKVEC
+	       (dccp_msk(sk)->dccpms_send_ack_vector &&
+		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
+#endif
+	       inet_csk_ack_scheduled(sk);
 }
 
-extern int  dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
-extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
-extern int  dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
-extern int  dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
-				  struct sk_buff *skb);
-extern int  dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
-extern void dccp_feat_list_purge(struct list_head *fn_list);
-
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
 extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
 extern int dccp_insert_option_elapsed_time(struct sock *sk,
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 93aae7c9555..d8a3509b26f 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_backoff	= icsk->icsk_backoff;
 	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
 
-	if (dp->dccps_hc_rx_ackvec != NULL)
+	if (dccp_msk(sk)->dccpms_send_ack_vector)
 		info->tcpi_options |= TCPI_OPT_SACK;
 
 	ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index f94c7c9d1a7..933a0ecf8d4 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1,19 +1,11 @@
 /*
  *  net/dccp/feat.c
  *
- *  Feature negotiation for the DCCP protocol (RFC 4340, section 6)
- *
- *  Copyright (c) 2008 The University of Aberdeen, Scotland, UK
- *  Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
- *  Rewrote from scratch, some bits from earlier code by
- *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
+ *  An implementation of the DCCP protocol
+ *  Andrea Bittau <a.bittau@cs.ucl.ac.uk>
  *
  *  ASSUMPTIONS
  *  -----------
- *  o Feature negotiation is coordinated with connection setup (as in TCP), wild
- *    changes of parameters of an established connection are not supported.
- *  o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN.
  *  o All currently known SP features have 1-byte quantities. If in the future
  *    extensions of RFCs 4340..42 define features with item lengths larger than
  *    one byte, a feature-specific extension of the code will be required.
@@ -23,1510 +15,635 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  */
+
 #include <linux/module.h>
+
 #include "ccid.h"
 #include "feat.h"
 
-/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
-unsigned long	sysctl_dccp_sequence_window __read_mostly = 100;
-int		sysctl_dccp_rx_ccid	    __read_mostly = 2,
-		sysctl_dccp_tx_ccid	    __read_mostly = 2;
+#define DCCP_FEAT_SP_NOAGREE (-123)
 
-/*
- * Feature activation handlers.
- *
- * These all use an u64 argument, to provide enough room for NN/SP features. At
- * this stage the negotiated values have been checked to be within their range.
- */
-static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
+int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
+		     u8 *val, u8 len, gfp_t gfp)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any());
+	struct dccp_opt_pend *opt;
 
-	if (new_ccid == NULL)
-		return -ENOMEM;
+	dccp_feat_debug(type, feature, *val);
 
-	if (rx) {
-		ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-		dp->dccps_hc_rx_ccid = new_ccid;
-	} else {
-		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-		dp->dccps_hc_tx_ccid = new_ccid;
+	if (len > 3) {
+		DCCP_WARN("invalid length %d\n", len);
+		return -EINVAL;
+	}
+	/* XXX add further sanity checks */
+
+	/* check if that feature is already being negotiated */
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
+		/* ok we found a negotiation for this option already */
+		if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
+			dccp_pr_debug("Replacing old\n");
+			/* replace */
+			BUG_ON(opt->dccpop_val == NULL);
+			kfree(opt->dccpop_val);
+			opt->dccpop_val	 = val;
+			opt->dccpop_len	 = len;
+			opt->dccpop_conf = 0;
+			return 0;
+		}
 	}
-	return 0;
-}
 
-static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
+	/* negotiation for a new feature */
+	opt = kmalloc(sizeof(*opt), gfp);
+	if (opt == NULL)
+		return -ENOMEM;
 
-	if (rx) {
-		dp->dccps_r_seq_win = seq_win;
-		/* propagate changes to update SWL/SWH */
-		dccp_update_gsr(sk, dp->dccps_gsr);
-	} else {
-		dp->dccps_l_seq_win = seq_win;
-		/* propagate changes to update AWL */
-		dccp_update_gss(sk, dp->dccps_gss);
-	}
-	return 0;
-}
+	opt->dccpop_type = type;
+	opt->dccpop_feat = feature;
+	opt->dccpop_len	 = len;
+	opt->dccpop_val	 = val;
+	opt->dccpop_conf = 0;
+	opt->dccpop_sc	 = NULL;
 
-static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
-{
-#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__
-	/*
-	 * FIXME: This is required until several problems in the CCID-2 code are
-	 * resolved. The CCID-2 code currently does not cope well; using dynamic
-	 * Ack Ratios greater than 1 caused instabilities. These were manifest
-	 * in hangups and long RTO timeouts (1...3 seconds). Until this has been
-	 * stabilised, it is safer not to activate dynamic Ack Ratio changes.
-	 */
-	dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n",
-		      rx ? "RX" : "TX", (u16)ratio);
-	ratio = 1;
-#endif
-	if (rx)
-		dccp_sk(sk)->dccps_r_ack_ratio = ratio;
-	else
-		dccp_sk(sk)->dccps_l_ack_ratio = ratio;
+	BUG_ON(opt->dccpop_val == NULL);
+
+	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
 	return 0;
 }
 
-static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
+EXPORT_SYMBOL_GPL(dccp_feat_change);
+
+static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
+	/* figure out if we are changing our CCID or the peer's */
+	const int rx = type == DCCPO_CHANGE_R;
+	const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
+	struct ccid *new_ccid;
+
+	/* Check if nothing is being changed. */
+	if (ccid_nr == new_ccid_nr)
+		return 0;
+
+	new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC);
+	if (new_ccid == NULL)
+		return -ENOMEM;
 
 	if (rx) {
-		if (enable && dp->dccps_hc_rx_ackvec == NULL) {
-			dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
-			if (dp->dccps_hc_rx_ackvec == NULL)
-				return -ENOMEM;
-		} else if (!enable) {
-			dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
-			dp->dccps_hc_rx_ackvec = NULL;
-		}
+		ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+		dp->dccps_hc_rx_ccid = new_ccid;
+		dmsk->dccpms_rx_ccid = new_ccid_nr;
+	} else {
+		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+		dp->dccps_hc_tx_ccid = new_ccid;
+		dmsk->dccpms_tx_ccid = new_ccid_nr;
 	}
-	return 0;
-}
 
-static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
-{
-	if (!rx)
-		dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
 	return 0;
 }
 
-/*
- * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
- * `rx' holds when the sending peer informs about his partial coverage via a
- * ChangeR() option. In the other case, we are the sender and the receiver
- * announces its coverage via ChangeL() options. The policy here is to honour
- * such communication by enabling the corresponding partial coverage - but only
- * if it has not been set manually before; the warning here means that all
- * packets will be dropped.
- */
-static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
+static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	dccp_feat_debug(type, feat, val);
 
-	if (rx)
-		dp->dccps_pcrlen = cscov;
-	else {
-		if (dp->dccps_pcslen == 0)
-			dp->dccps_pcslen = cscov;
-		else if (cscov > dp->dccps_pcslen)
-			DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
-				  dp->dccps_pcslen, (u8)cscov);
+	switch (feat) {
+	case DCCPF_CCID:
+		return dccp_feat_update_ccid(sk, type, val);
+	default:
+		dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n",
+			      dccp_feat_typename(type), feat);
+		break;
 	}
 	return 0;
 }
 
-static const struct {
-	u8			feat_num;		/* DCCPF_xxx */
-	enum dccp_feat_type	rxtx;			/* RX or TX  */
-	enum dccp_feat_type	reconciliation;		/* SP or NN  */
-	u8			default_value;		/* as in 6.4 */
-	int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
-/*
- *    Lookup table for location and type of features (from RFC 4340/4342)
- *  +--------------------------+----+-----+----+----+---------+-----------+
- *  | Feature                  | Location | Reconc. | Initial |  Section  |
- *  |                          | RX | TX  | SP | NN |  Value  | Reference |
- *  +--------------------------+----+-----+----+----+---------+-----------+
- *  | DCCPF_CCID               |    |  X  | X  |    |   2     | 10        |
- *  | DCCPF_SHORT_SEQNOS       |    |  X  | X  |    |   0     |  7.6.1    |
- *  | DCCPF_SEQUENCE_WINDOW    |    |  X  |    | X  | 100     |  7.5.2    |
- *  | DCCPF_ECN_INCAPABLE      | X  |     | X  |    |   0     | 12.1      |
- *  | DCCPF_ACK_RATIO          |    |  X  |    | X  |   2     | 11.3      |
- *  | DCCPF_SEND_ACK_VECTOR    | X  |     | X  |    |   0     | 11.5      |
- *  | DCCPF_SEND_NDP_COUNT     |    |  X  | X  |    |   0     |  7.7.2    |
- *  | DCCPF_MIN_CSUM_COVER     | X  |     | X  |    |   0     |  9.2.1    |
- *  | DCCPF_DATA_CHECKSUM      | X  |     | X  |    |   0     |  9.3.1    |
- *  | DCCPF_SEND_LEV_RATE      | X  |     | X  |    |   0     | 4342/8.4  |
- *  +--------------------------+----+-----+----+----+---------+-----------+
- */
-} dccp_feat_table[] = {
-	{ DCCPF_CCID,		 FEAT_AT_TX, FEAT_SP, 2,   dccp_hdlr_ccid     },
-	{ DCCPF_SHORT_SEQNOS,	 FEAT_AT_TX, FEAT_SP, 0,   NULL },
-	{ DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win  },
-	{ DCCPF_ECN_INCAPABLE,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
-	{ DCCPF_ACK_RATIO,	 FEAT_AT_TX, FEAT_NN, 2,   dccp_hdlr_ack_ratio},
-	{ DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0,   dccp_hdlr_ackvec   },
-	{ DCCPF_SEND_NDP_COUNT,  FEAT_AT_TX, FEAT_SP, 0,   dccp_hdlr_ndp      },
-	{ DCCPF_MIN_CSUM_COVER,  FEAT_AT_RX, FEAT_SP, 0,   dccp_hdlr_min_cscov},
-	{ DCCPF_DATA_CHECKSUM,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
-	{ DCCPF_SEND_LEV_RATE,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
-};
-#define DCCP_FEAT_SUPPORTED_MAX		ARRAY_SIZE(dccp_feat_table)
-
-/**
- * dccp_feat_index  -  Hash function to map feature number into array position
- * Returns consecutive array index or -1 if the feature is not understood.
- */
-static int dccp_feat_index(u8 feat_num)
+static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
+			       u8 *rpref, u8 rlen)
 {
-	/* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
-	if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
-		return feat_num - 1;
+	struct dccp_sock *dp = dccp_sk(sk);
+	u8 *spref, slen, *res = NULL;
+	int i, j, rc, agree = 1;
 
+	BUG_ON(rpref == NULL);
+
+	/* check if we are the black sheep */
+	if (dp->dccps_role == DCCP_ROLE_CLIENT) {
+		spref = rpref;
+		slen  = rlen;
+		rpref = opt->dccpop_val;
+		rlen  = opt->dccpop_len;
+	} else {
+		spref = opt->dccpop_val;
+		slen  = opt->dccpop_len;
+	}
 	/*
-	 * Other features: add cases for new feature types here after adding
-	 * them to the above table.
+	 * Now we have server preference list in spref and client preference in
+	 * rpref
 	 */
-	switch (feat_num) {
-	case DCCPF_SEND_LEV_RATE:
-			return DCCP_FEAT_SUPPORTED_MAX - 1;
-	}
-	return -1;
-}
-
-static u8 dccp_feat_type(u8 feat_num)
-{
-	int idx = dccp_feat_index(feat_num);
-
-	if (idx < 0)
-		return FEAT_UNKNOWN;
-	return dccp_feat_table[idx].reconciliation;
-}
+	BUG_ON(spref == NULL);
+	BUG_ON(rpref == NULL);
 
-static int dccp_feat_default_value(u8 feat_num)
-{
-	int idx = dccp_feat_index(feat_num);
+	/* FIXME sanity check vals */
 
-	return idx < 0 ? : dccp_feat_table[idx].default_value;
-}
-
-/*
- *	Debugging and verbose-printing section
- */
-static const char *dccp_feat_fname(const u8 feat)
-{
-	static const char *feature_names[] = {
-		[DCCPF_RESERVED]	= "Reserved",
-		[DCCPF_CCID]		= "CCID",
-		[DCCPF_SHORT_SEQNOS]	= "Allow Short Seqnos",
-		[DCCPF_SEQUENCE_WINDOW]	= "Sequence Window",
-		[DCCPF_ECN_INCAPABLE]	= "ECN Incapable",
-		[DCCPF_ACK_RATIO]	= "Ack Ratio",
-		[DCCPF_SEND_ACK_VECTOR]	= "Send ACK Vector",
-		[DCCPF_SEND_NDP_COUNT]	= "Send NDP Count",
-		[DCCPF_MIN_CSUM_COVER]	= "Min. Csum Coverage",
-		[DCCPF_DATA_CHECKSUM]	= "Send Data Checksum",
-	};
-	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
-		return feature_names[DCCPF_RESERVED];
-
-	if (feat ==  DCCPF_SEND_LEV_RATE)
-		return "Send Loss Event Rate";
-	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
-		return "CCID-specific";
-
-	return feature_names[feat];
-}
-
-static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING",
-					 "UNSTABLE", "STABLE" };
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-static const char *dccp_feat_oname(const u8 opt)
-{
-	switch (opt) {
-	case DCCPO_CHANGE_L:  return "Change_L";
-	case DCCPO_CONFIRM_L: return "Confirm_L";
-	case DCCPO_CHANGE_R:  return "Change_R";
-	case DCCPO_CONFIRM_R: return "Confirm_R";
+	/* Are values in any order?  XXX Lame "algorithm" here */
+	for (i = 0; i < slen; i++) {
+		for (j = 0; j < rlen; j++) {
+			if (spref[i] == rpref[j]) {
+				res = &spref[i];
+				break;
+			}
+		}
+		if (res)
+			break;
 	}
-	return NULL;
-}
 
-static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
-{
-	u8 i, type = dccp_feat_type(feat_num);
-
-	if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
-		dccp_pr_debug_cat("(NULL)");
-	else if (type == FEAT_SP)
-		for (i = 0; i < val->sp.len; i++)
-			dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
-	else if (type == FEAT_NN)
-		dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
-	else
-		dccp_pr_debug_cat("unknown type %u", type);
-}
-
-static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
-{
-	u8 type = dccp_feat_type(feat_num);
-	dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
-
-	if (type == FEAT_NN)
-		fval.nn = dccp_decode_value_var(list, len);
-	dccp_feat_printval(feat_num, &fval);
-}
+	/* we didn't agree on anything */
+	if (res == NULL) {
+		/* confirm previous value */
+		switch (opt->dccpop_feat) {
+		case DCCPF_CCID:
+			/* XXX did i get this right? =P */
+			if (opt->dccpop_type == DCCPO_CHANGE_L)
+				res = &dccp_msk(sk)->dccpms_tx_ccid;
+			else
+				res = &dccp_msk(sk)->dccpms_rx_ccid;
+			break;
 
-static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
-{
-	dccp_debug("   * %s %s = ", entry->is_local ? "local" : "remote",
-				    dccp_feat_fname(entry->feat_num));
-	dccp_feat_printval(entry->feat_num, &entry->val);
-	dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
-			  entry->needs_confirm ? "(Confirm pending)" : "");
-}
+		default:
+			DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat);
+			/* XXX implement res */
+			return -EFAULT;
+		}
 
-#define dccp_feat_print_opt(opt, feat, val, len, mandatory)	do {	      \
-	dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
-	dccp_feat_printvals(feat, val, len);				      \
-	dccp_pr_debug_cat(") %s\n", mandatory ? "!" : "");	} while (0)
-
-#define dccp_feat_print_fnlist(fn_list)  {		\
-	const struct dccp_feat_entry *___entry;		\
-							\
-	dccp_pr_debug("List Dump:\n");			\
-	list_for_each_entry(___entry, fn_list, node)	\
-		dccp_feat_print_entry(___entry);	\
-}
-#else	/* ! CONFIG_IP_DCCP_DEBUG */
-#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
-#define dccp_feat_print_fnlist(fn_list)
-#endif
+		dccp_pr_debug("Don't agree... reconfirming %d\n", *res);
+		agree = 0; /* this is used for mandatory options... */
+	}
 
-static int __dccp_feat_activate(struct sock *sk, const int idx,
-				const bool is_local, dccp_feat_val const *fval)
-{
-	bool rx;
-	u64 val;
+	/* need to put result and our preference list */
+	rlen = 1 + opt->dccpop_len;
+	rpref = kmalloc(rlen, GFP_ATOMIC);
+	if (rpref == NULL)
+		return -ENOMEM;
 
-	if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
-		return -1;
-	if (dccp_feat_table[idx].activation_hdlr == NULL)
-		return 0;
+	*rpref = *res;
+	memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len);
 
-	if (fval == NULL) {
-		val = dccp_feat_table[idx].default_value;
-	} else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
-		if (fval->sp.vec == NULL) {
-			/*
-			 * This can happen when an empty Confirm is sent
-			 * for an SP (i.e. known) feature. In this case
-			 * we would be using the default anyway.
-			 */
-			DCCP_CRIT("Feature #%d undefined: using default", idx);
-			val = dccp_feat_table[idx].default_value;
-		} else {
-			val = fval->sp.vec[0];
+	/* put it in the "confirm queue" */
+	if (opt->dccpop_sc == NULL) {
+		opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC);
+		if (opt->dccpop_sc == NULL) {
+			kfree(rpref);
+			return -ENOMEM;
 		}
 	} else {
-		val = fval->nn;
+		/* recycle the confirm slot */
+		BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
+		kfree(opt->dccpop_sc->dccpoc_val);
+		dccp_pr_debug("recycling confirm slot\n");
+	}
+	memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
+
+	opt->dccpop_sc->dccpoc_val = rpref;
+	opt->dccpop_sc->dccpoc_len = rlen;
+
+	/* update the option on our side [we are about to send the confirm] */
+	rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
+	if (rc) {
+		kfree(opt->dccpop_sc->dccpoc_val);
+		kfree(opt->dccpop_sc);
+		opt->dccpop_sc = NULL;
+		return rc;
 	}
 
-	/* Location is RX if this is a local-RX or remote-TX feature */
-	rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
-
-	dccp_debug("   -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
-		   dccp_feat_fname(dccp_feat_table[idx].feat_num),
-		   fval ? "" : "default ",  (unsigned long long)val);
-
-	return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
-}
-
-/**
- * dccp_feat_activate  -  Activate feature value on socket
- * @sk: fully connected DCCP socket (after handshake is complete)
- * @feat_num: feature to activate, one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is meant
- * @fval: the value (SP or NN) to activate, or NULL to use the default value
- * For general use this function is preferable over __dccp_feat_activate().
- */
-static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
-			      dccp_feat_val const *fval)
-{
-	return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
-}
-
-/* Test for "Req'd" feature (RFC 4340, 6.4) */
-static inline int dccp_feat_must_be_understood(u8 feat_num)
-{
-	return	feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
-		feat_num == DCCPF_SEQUENCE_WINDOW;
-}
+	dccp_pr_debug("Will confirm %d\n", *rpref);
 
-/* copy constructor, fval must not already contain allocated memory */
-static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
-{
-	fval->sp.len = len;
-	if (fval->sp.len > 0) {
-		fval->sp.vec = kmemdup(val, len, gfp_any());
-		if (fval->sp.vec == NULL) {
-			fval->sp.len = 0;
-			return -ENOBUFS;
-		}
+	/* say we want to change to X but we just got a confirm X, suppress our
+	 * change
+	 */
+	if (!opt->dccpop_conf) {
+		if (*opt->dccpop_val == *res)
+			opt->dccpop_conf = 1;
+		dccp_pr_debug("won't ask for change of same feature\n");
 	}
-	return 0;
-}
 
-static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
-{
-	if (unlikely(val == NULL))
-		return;
-	if (dccp_feat_type(feat_num) == FEAT_SP)
-		kfree(val->sp.vec);
-	memset(val, 0, sizeof(*val));
+	return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */
 }
 
-static struct dccp_feat_entry *
-	      dccp_feat_clone_entry(struct dccp_feat_entry const *original)
+static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 {
-	struct dccp_feat_entry *new;
-	u8 type = dccp_feat_type(original->feat_num);
-
-	if (type == FEAT_UNKNOWN)
-		return NULL;
+	struct dccp_minisock *dmsk = dccp_msk(sk);
+	struct dccp_opt_pend *opt;
+	int rc = 1;
+	u8 t;
 
-	new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
-	if (new == NULL)
-		return NULL;
+	/*
+	 * We received a CHANGE.  We gotta match it against our own preference
+	 * list.  If we got a CHANGE_R it means it's a change for us, so we need
+	 * to compare our CHANGE_L list.
+	 */
+	if (type == DCCPO_CHANGE_L)
+		t = DCCPO_CHANGE_R;
+	else
+		t = DCCPO_CHANGE_L;
 
-	if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
-						      original->val.sp.vec,
-						      original->val.sp.len)) {
-		kfree(new);
-		return NULL;
-	}
-	return new;
-}
+	/* find our preference list for this feature */
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
+		if (opt->dccpop_type != t || opt->dccpop_feat != feature)
+			continue;
 
-static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
-{
-	if (entry != NULL) {
-		dccp_feat_val_destructor(entry->feat_num, &entry->val);
-		kfree(entry);
+		/* find the winner from the two preference lists */
+		rc = dccp_feat_reconcile(sk, opt, val, len);
+		break;
 	}
-}
 
-/*
- * List management functions
- *
- * Feature negotiation lists rely on and maintain the following invariants:
- * - each feat_num in the list is known, i.e. we know its type and default value
- * - each feat_num/is_local combination is unique (old entries are overwritten)
- * - SP values are always freshly allocated
- * - list is sorted in increasing order of feature number (faster lookup)
- */
-static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
-						     u8 feat_num, bool is_local)
-{
-	struct dccp_feat_entry *entry;
+	/* We didn't deal with the change.  This can happen if we have no
+	 * preference list for the feature.  In fact, it just shouldn't
+	 * happen---if we understand a feature, we should have a preference list
+	 * with at least the default value.
+	 */
+	BUG_ON(rc == 1);
 
-	list_for_each_entry(entry, fn_list, node)
-		if (entry->feat_num == feat_num && entry->is_local == is_local)
-			return entry;
-		else if (entry->feat_num > feat_num)
-			break;
-	return NULL;
+	return rc;
 }
 
-/**
- * dccp_feat_entry_new  -  Central list update routine (called by all others)
- * @head:  list to add to
- * @feat:  feature number
- * @local: whether the local (1) or remote feature with number @feat is meant
- * This is the only constructor and serves to ensure the above invariants.
- */
-static struct dccp_feat_entry *
-	      dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
+static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 {
-	struct dccp_feat_entry *entry;
-
-	list_for_each_entry(entry, head, node)
-		if (entry->feat_num == feat && entry->is_local == local) {
-			dccp_feat_val_destructor(entry->feat_num, &entry->val);
-			return entry;
-		} else if (entry->feat_num > feat) {
-			head = &entry->node;
-			break;
-		}
+	struct dccp_opt_pend *opt;
+	struct dccp_minisock *dmsk = dccp_msk(sk);
+	u8 *copy;
+	int rc;
 
-	entry = kmalloc(sizeof(*entry), gfp_any());
-	if (entry != NULL) {
-		entry->feat_num = feat;
-		entry->is_local = local;
-		list_add_tail(&entry->node, head);
+	/* NN features must be Change L (sec. 6.3.2) */
+	if (type != DCCPO_CHANGE_L) {
+		dccp_pr_debug("received %s for NN feature %d\n",
+				dccp_feat_typename(type), feature);
+		return -EFAULT;
 	}
-	return entry;
-}
 
-/**
- * dccp_feat_push_change  -  Add/overwrite a Change option in the list
- * @fn_list: feature-negotiation list to update
- * @feat: one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is meant
- * @needs_mandatory: whether to use Mandatory feature negotiation options
- * @fval: pointer to NN/SP value to be inserted (will be copied)
- */
-static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
-				 u8 mandatory, dccp_feat_val *fval)
-{
-	struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+	/* XXX sanity check opt val */
 
-	if (new == NULL)
+	/* copy option so we can confirm it */
+	opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
+	if (opt == NULL)
 		return -ENOMEM;
 
-	new->feat_num	     = feat;
-	new->is_local	     = local;
-	new->state	     = FEAT_INITIALISING;
-	new->needs_confirm   = 0;
-	new->empty_confirm   = 0;
-	new->val	     = *fval;
-	new->needs_mandatory = mandatory;
+	copy = kmemdup(val, len, GFP_ATOMIC);
+	if (copy == NULL) {
+		kfree(opt);
+		return -ENOMEM;
+	}
 
-	return 0;
-}
+	opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */
+	opt->dccpop_feat = feature;
+	opt->dccpop_val	 = copy;
+	opt->dccpop_len	 = len;
 
-/**
- * dccp_feat_push_confirm  -  Add a Confirm entry to the FN list
- * @fn_list: feature-negotiation list to add to
- * @feat: one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is being confirmed
- * @fval: pointer to NN/SP value to be inserted or NULL
- * Returns 0 on success, a Reset code for further processing otherwise.
- */
-static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
-				  dccp_feat_val *fval)
-{
-	struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+	/* change feature */
+	rc = dccp_feat_update(sk, type, feature, *val);
+	if (rc) {
+		kfree(opt->dccpop_val);
+		kfree(opt);
+		return rc;
+	}
 
-	if (new == NULL)
-		return DCCP_RESET_CODE_TOO_BUSY;
+	dccp_feat_debug(type, feature, *copy);
 
-	new->feat_num	     = feat;
-	new->is_local	     = local;
-	new->state	     = FEAT_STABLE;	/* transition in 6.6.2 */
-	new->needs_confirm   = 1;
-	new->empty_confirm   = (fval == NULL);
-	new->val.nn	     = 0;		/* zeroes the whole structure */
-	if (!new->empty_confirm)
-		new->val     = *fval;
-	new->needs_mandatory = 0;
+	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
 
 	return 0;
 }
 
-static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
+static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
+				    u8 type, u8 feature)
 {
-	return dccp_feat_push_confirm(fn_list, feat, local, NULL);
-}
+	/* XXX check if other confirms for that are queued and recycle slot */
+	struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
 
-static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
-{
-	list_del(&entry->node);
-	dccp_feat_entry_destructor(entry);
-}
-
-void dccp_feat_list_purge(struct list_head *fn_list)
-{
-	struct dccp_feat_entry *entry, *next;
-
-	list_for_each_entry_safe(entry, next, fn_list, node)
-		dccp_feat_entry_destructor(entry);
-	INIT_LIST_HEAD(fn_list);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
-
-/* generate @to as full clone of @from - @to must not contain any nodes */
-int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
-{
-	struct dccp_feat_entry *entry, *new;
-
-	INIT_LIST_HEAD(to);
-	list_for_each_entry(entry, from, node) {
-		new = dccp_feat_clone_entry(entry);
-		if (new == NULL)
-			goto cloning_failed;
-		list_add_tail(&new->node, to);
+	if (opt == NULL) {
+		/* XXX what do we do?  Ignoring should be fine.  It's a change
+		 * after all =P
+		 */
+		return;
 	}
-	return 0;
 
-cloning_failed:
-	dccp_feat_list_purge(to);
-	return -ENOMEM;
-}
+	switch (type) {
+	case DCCPO_CHANGE_L:
+		opt->dccpop_type = DCCPO_CONFIRM_R;
+		break;
+	case DCCPO_CHANGE_R:
+		opt->dccpop_type = DCCPO_CONFIRM_L;
+		break;
+	default:
+		DCCP_WARN("invalid type %d\n", type);
+		kfree(opt);
+		return;
+	}
+	opt->dccpop_feat = feature;
+	opt->dccpop_val	 = NULL;
+	opt->dccpop_len	 = 0;
 
-/**
- * dccp_feat_valid_nn_length  -  Enforce length constraints on NN options
- * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
- * incoming options are accepted as long as their values are valid.
- */
-static u8 dccp_feat_valid_nn_length(u8 feat_num)
-{
-	if (feat_num == DCCPF_ACK_RATIO)	/* RFC 4340, 11.3 and 6.6.8 */
-		return 2;
-	if (feat_num == DCCPF_SEQUENCE_WINDOW)	/* RFC 4340, 7.5.2 and 6.5  */
-		return 6;
-	return 0;
-}
+	/* change feature */
+	dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature);
 
-static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
-{
-	switch (feat_num) {
-	case DCCPF_ACK_RATIO:
-		return val <= DCCPF_ACK_RATIO_MAX;
-	case DCCPF_SEQUENCE_WINDOW:
-		return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
-	}
-	return 0;	/* feature unknown - so we can't tell */
+	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
 }
 
-/* check that SP values are within the ranges defined in RFC 4340 */
-static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
+static void dccp_feat_flush_confirm(struct sock *sk)
 {
-	switch (feat_num) {
-	case DCCPF_CCID:
-		return val == DCCPC_CCID2 || val == DCCPC_CCID3;
-	/* Type-check Boolean feature values: */
-	case DCCPF_SHORT_SEQNOS:
-	case DCCPF_ECN_INCAPABLE:
-	case DCCPF_SEND_ACK_VECTOR:
-	case DCCPF_SEND_NDP_COUNT:
-	case DCCPF_DATA_CHECKSUM:
-	case DCCPF_SEND_LEV_RATE:
-		return val < 2;
-	case DCCPF_MIN_CSUM_COVER:
-		return val < 16;
-	}
-	return 0;			/* feature unknown */
-}
+	struct dccp_minisock *dmsk = dccp_msk(sk);
+	/* Check if there is anything to confirm in the first place */
+	int yes = !list_empty(&dmsk->dccpms_conf);
 
-static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
-{
-	if (sp_list == NULL || sp_len < 1)
-		return 0;
-	while (sp_len--)
-		if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
-			return 0;
-	return 1;
-}
+	if (!yes) {
+		struct dccp_opt_pend *opt;
 
-/**
- * dccp_feat_insert_opts  -  Generate FN options from current list state
- * @skb: next sk_buff to be sent to the peer
- * @dp: for client during handshake and general negotiation
- * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
- */
-int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
-			  struct sk_buff *skb)
-{
-	struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
-	struct dccp_feat_entry *pos, *next;
-	u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
-	bool rpt;
-
-	/* put entries into @skb in the order they appear in the list */
-	list_for_each_entry_safe_reverse(pos, next, fn, node) {
-		opt  = dccp_feat_genopt(pos);
-		type = dccp_feat_type(pos->feat_num);
-		rpt  = false;
-
-		if (pos->empty_confirm) {
-			len = 0;
-			ptr = NULL;
-		} else {
-			if (type == FEAT_SP) {
-				len = pos->val.sp.len;
-				ptr = pos->val.sp.vec;
-				rpt = pos->needs_confirm;
-			} else if (type == FEAT_NN) {
-				len = dccp_feat_valid_nn_length(pos->feat_num);
-				ptr = nn_in_nbo;
-				dccp_encode_value_var(pos->val.nn, ptr, len);
-			} else {
-				DCCP_BUG("unknown feature %u", pos->feat_num);
-				return -1;
+		list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
+			if (opt->dccpop_conf) {
+				yes = 1;
+				break;
 			}
 		}
-		dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
-
-		if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
-			return -1;
-		if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
-			return -1;
-		/*
-		 * Enter CHANGING after transmitting the Change option (6.6.2).
-		 */
-		if (pos->state == FEAT_INITIALISING)
-			pos->state = FEAT_CHANGING;
 	}
-	return 0;
-}
-
-/**
- * __feat_register_nn  -  Register new NN value on socket
- * @fn: feature-negotiation list to register with
- * @feat: an NN feature from %dccp_feature_numbers
- * @mandatory: use Mandatory option if 1
- * @nn_val: value to register (restricted to 4 bytes)
- * Note that NN features are local by definition (RFC 4340, 6.3.2).
- */
-static int __feat_register_nn(struct list_head *fn, u8 feat,
-			      u8 mandatory, u64 nn_val)
-{
-	dccp_feat_val fval = { .nn = nn_val };
-
-	if (dccp_feat_type(feat) != FEAT_NN ||
-	    !dccp_feat_is_valid_nn_val(feat, nn_val))
-		return -EINVAL;
-
-	/* Don't bother with default values, they will be activated anyway. */
-	if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
-		return 0;
-
-	return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
-}
-
-/**
- * __feat_register_sp  -  Register new SP value/list on socket
- * @fn: feature-negotiation list to register with
- * @feat: an SP feature from %dccp_feature_numbers
- * @is_local: whether the local (1) or the remote (0) @feat is meant
- * @mandatory: use Mandatory option if 1
- * @sp_val: SP value followed by optional preference list
- * @sp_len: length of @sp_val in bytes
- */
-static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
-			      u8 mandatory, u8 const *sp_val, u8 sp_len)
-{
-	dccp_feat_val fval;
 
-	if (dccp_feat_type(feat) != FEAT_SP ||
-	    !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
-		return -EINVAL;
-
-	/* Avoid negotiating alien CCIDs by only advertising supported ones */
-	if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
-		return -EOPNOTSUPP;
-
-	if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
-		return -ENOMEM;
+	if (!yes)
+		return;
 
-	return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
+	/* OK there is something to confirm... */
+	/* XXX check if packet is in flight?  Send delayed ack?? */
+	if (sk->sk_state == DCCP_OPEN)
+		dccp_send_ack(sk);
 }
 
-/**
- * dccp_feat_register_sp  -  Register requests to change SP feature values
- * @sk: client or listening socket
- * @feat: one of %dccp_feature_numbers
- * @is_local: whether the local (1) or remote (0) @feat is meant
- * @list: array of preferred values, in descending order of preference
- * @len: length of @list in bytes
- */
-int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
-			  u8 const *list, u8 len)
-{	 /* any changes must be registered before establishing the connection */
-	if (sk->sk_state != DCCP_CLOSED)
-		return -EISCONN;
-	if (dccp_feat_type(feat) != FEAT_SP)
-		return -EINVAL;
-	return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
-				  0, list, len);
-}
-
-/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
-int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
+int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 {
-	/* any changes must be registered before establishing the connection */
-	if (sk->sk_state != DCCP_CLOSED)
-		return -EISCONN;
-	if (dccp_feat_type(feat) != FEAT_NN)
-		return -EINVAL;
-	return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
-}
+	int rc;
 
-/**
- * dccp_feat_signal_nn_change  -  Update NN values for an established connection
- * @sk: DCCP socket of an established connection
- * @feat: NN feature number from %dccp_feature_numbers
- * @nn_val: the new value to use
- * This function is used to communicate NN updates out-of-band. The difference
- * to feature negotiation during connection setup is that values are activated
- * immediately after validation, i.e. we don't wait for the Confirm: either the
- * value is accepted by the peer (and then the waiting is futile), or it is not
- * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values
- * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2).
- */
-int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
-{
-	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
-	dccp_feat_val fval = { .nn = nn_val };
-	struct dccp_feat_entry *entry;
+	dccp_feat_debug(type, feature, *val);
 
-	if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
-		return 0;
+	/* figure out if it's SP or NN feature */
+	switch (feature) {
+	/* deal with SP features */
+	case DCCPF_CCID:
+		rc = dccp_feat_sp(sk, type, feature, val, len);
+		break;
 
-	if (dccp_feat_type(feat) != FEAT_NN ||
-	    !dccp_feat_is_valid_nn_val(feat, nn_val))
-		return -EINVAL;
+	/* deal with NN features */
+	case DCCPF_ACK_RATIO:
+		rc = dccp_feat_nn(sk, type, feature, val, len);
+		break;
 
-	entry = dccp_feat_list_lookup(fn, feat, 1);
-	if (entry != NULL) {
-		dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n",
-			      (unsigned long long)nn_val,
-			      (unsigned long long)entry->val.nn,
-			      dccp_feat_sname[entry->state]);
-		return 0;
+	/* XXX implement other features */
+	default:
+		dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n",
+			      dccp_feat_typename(type), feature);
+		rc = -EFAULT;
+		break;
 	}
 
-	if (dccp_feat_activate(sk, feat, 1, &fval))
-		return -EADV;
-
-	inet_csk_schedule_ack(sk);
-	return dccp_feat_push_change(fn, feat, 1, 0, &fval);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
-
-/*
- *	Tracking features whose value depend on the choice of CCID
- *
- * This is designed with an extension in mind so that a list walk could be done
- * before activating any features. However, the existing framework was found to
- * work satisfactorily up until now, the automatic verification is left open.
- * When adding new CCIDs, add a corresponding dependency table here.
- */
-static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
-{
-	static const struct ccid_dependency ccid2_dependencies[2][2] = {
-		/*
-		 * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
-		 * feature and Send Ack Vector is an RX feature, `is_local'
-		 * needs to be reversed.
+	/* check if there were problems changing features */
+	if (rc) {
+		/* If we don't agree on SP, we sent a confirm for old value.
+		 * However we propagate rc to caller in case option was
+		 * mandatory
 		 */
-		{	/* Dependencies of the receiver-side (remote) CCID2 */
-			{
-				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
-				.is_local	= true,
-				.is_mandatory	= true,
-				.val		= 1
-			},
-			{ 0, 0, 0, 0 }
-		},
-		{	/* Dependencies of the sender-side (local) CCID2 */
-			{
-				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
-				.is_local	= false,
-				.is_mandatory	= true,
-				.val		= 1
-			},
-			{ 0, 0, 0, 0 }
-		}
-	};
-	static const struct ccid_dependency ccid3_dependencies[2][5] = {
-		{	/*
-			 * Dependencies of the receiver-side CCID3
-			 */
-			{	/* locally disable Ack Vectors */
-				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
-				.is_local	= true,
-				.is_mandatory	= false,
-				.val		= 0
-			},
-			{	/* see below why Send Loss Event Rate is on */
-				.dependent_feat	= DCCPF_SEND_LEV_RATE,
-				.is_local	= true,
-				.is_mandatory	= true,
-				.val		= 1
-			},
-			{	/* NDP Count is needed as per RFC 4342, 6.1.1 */
-				.dependent_feat	= DCCPF_SEND_NDP_COUNT,
-				.is_local	= false,
-				.is_mandatory	= true,
-				.val		= 1
-			},
-			{ 0, 0, 0, 0 },
-		},
-		{	/*
-			 * CCID3 at the TX side: we request that the HC-receiver
-			 * will not send Ack Vectors (they will be ignored, so
-			 * Mandatory is not set); we enable Send Loss Event Rate
-			 * (Mandatory since the implementation does not support
-			 * the Loss Intervals option of RFC 4342, 8.6).
-			 * The last two options are for peer's information only.
-			*/
-			{
-				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
-				.is_local	= false,
-				.is_mandatory	= false,
-				.val		= 0
-			},
-			{
-				.dependent_feat	= DCCPF_SEND_LEV_RATE,
-				.is_local	= false,
-				.is_mandatory	= true,
-				.val		= 1
-			},
-			{	/* this CCID does not support Ack Ratio */
-				.dependent_feat	= DCCPF_ACK_RATIO,
-				.is_local	= true,
-				.is_mandatory	= false,
-				.val		= 0
-			},
-			{	/* tell receiver we are sending NDP counts */
-				.dependent_feat	= DCCPF_SEND_NDP_COUNT,
-				.is_local	= true,
-				.is_mandatory	= false,
-				.val		= 1
-			},
-			{ 0, 0, 0, 0 }
-		}
-	};
-	switch (ccid) {
-	case DCCPC_CCID2:
-		return ccid2_dependencies[is_local];
-	case DCCPC_CCID3:
-		return ccid3_dependencies[is_local];
-	default:
-		return NULL;
+		if (rc != DCCP_FEAT_SP_NOAGREE)
+			dccp_feat_empty_confirm(dccp_msk(sk), type, feature);
 	}
-}
 
-/**
- * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
- * @fn: feature-negotiation list to update
- * @id: CCID number to track
- * @is_local: whether TX CCID (1) or RX CCID (0) is meant
- * This function needs to be called after registering all other features.
- */
-static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
-{
-	const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
-	int i, rc = (table == NULL);
-
-	for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
-		if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
-			rc = __feat_register_sp(fn, table[i].dependent_feat,
-						    table[i].is_local,
-						    table[i].is_mandatory,
-						    &table[i].val, 1);
-		else
-			rc = __feat_register_nn(fn, table[i].dependent_feat,
-						    table[i].is_mandatory,
-						    table[i].val);
-	return rc;
-}
-
-/**
- * dccp_feat_finalise_settings  -  Finalise settings before starting negotiation
- * @dp: client or listening socket (settings will be inherited)
- * This is called after all registrations (socket initialisation, sysctls, and
- * sockopt calls), and before sending the first packet containing Change options
- * (ie. client-Request or server-Response), to ensure internal consistency.
- */
-int dccp_feat_finalise_settings(struct dccp_sock *dp)
-{
-	struct list_head *fn = &dp->dccps_featneg;
-	struct dccp_feat_entry *entry;
-	int i = 2, ccids[2] = { -1, -1 };
+	/* generate the confirm [if required] */
+	dccp_feat_flush_confirm(sk);
 
-	/*
-	 * Propagating CCIDs:
-	 * 1) not useful to propagate CCID settings if this host advertises more
-	 *    than one CCID: the choice of CCID  may still change - if this is
-	 *    the client, or if this is the server and the client sends
-	 *    singleton CCID values.
-	 * 2) since is that propagate_ccid changes the list, we defer changing
-	 *    the sorted list until after the traversal.
-	 */
-	list_for_each_entry(entry, fn, node)
-		if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
-			ccids[entry->is_local] = entry->val.sp.vec[0];
-	while (i--)
-		if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
-			return -1;
-	dccp_feat_print_fnlist(fn);
-	return 0;
+	return rc;
 }
 
-/**
- * dccp_feat_server_ccid_dependencies  -  Resolve CCID-dependent features
- * It is the server which resolves the dependencies once the CCID has been
- * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
- */
-int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
-{
-	struct list_head *fn = &dreq->dreq_featneg;
-	struct dccp_feat_entry *entry;
-	u8 is_local, ccid;
-
-	for (is_local = 0; is_local <= 1; is_local++) {
-		entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
-
-		if (entry != NULL && !entry->empty_confirm)
-			ccid = entry->val.sp.vec[0];
-		else
-			ccid = dccp_feat_default_value(DCCPF_CCID);
-
-		if (dccp_feat_propagate_ccid(fn, ccid, is_local))
-			return -1;
-	}
-	return 0;
-}
+EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
 
-/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
-static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
+int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
+			   u8 *val, u8 len)
 {
-	u8 c, s;
+	u8 t;
+	struct dccp_opt_pend *opt;
+	struct dccp_minisock *dmsk = dccp_msk(sk);
+	int found = 0;
+	int all_confirmed = 1;
 
-	for (s = 0; s < slen; s++)
-		for (c = 0; c < clen; c++)
-			if (servlist[s] == clilist[c])
-				return servlist[s];
-	return -1;
-}
+	dccp_feat_debug(type, feature, *val);
 
-/**
- * dccp_feat_prefer  -  Move preferred entry to the start of array
- * Reorder the @array_len elements in @array so that @preferred_value comes
- * first. Returns >0 to indicate that @preferred_value does occur in @array.
- */
-static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
-{
-	u8 i, does_occur = 0;
+	/* locate our change request */
+	switch (type) {
+	case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break;
+	case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break;
+	default:	      DCCP_WARN("invalid type %d\n", type);
+			      return 1;
 
-	if (array != NULL) {
-		for (i = 0; i < array_len; i++)
-			if (array[i] == preferred_value) {
-				array[i] = array[0];
-				does_occur++;
-			}
-		if (does_occur)
-			array[0] = preferred_value;
 	}
-	return does_occur;
-}
+	/* XXX sanity check feature value */
 
-/**
- * dccp_feat_reconcile  -  Reconcile SP preference lists
- *  @fval: SP list to reconcile into
- *  @arr: received SP preference list
- *  @len: length of @arr in bytes
- *  @is_server: whether this side is the server (and @fv is the server's list)
- *  @reorder: whether to reorder the list in @fv after reconciling with @arr
- * When successful, > 0 is returned and the reconciled list is in @fval.
- * A value of 0 means that negotiation failed (no shared entry).
- */
-static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
-			       bool is_server, bool reorder)
-{
-	int rc;
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
+		if (!opt->dccpop_conf && opt->dccpop_type == t &&
+		    opt->dccpop_feat == feature) {
+			found = 1;
+			dccp_pr_debug("feature %d found\n", opt->dccpop_feat);
 
-	if (!fv->sp.vec || !arr) {
-		DCCP_CRIT("NULL feature value or array");
-		return 0;
-	}
+			/* XXX do sanity check */
 
-	if (is_server)
-		rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
-	else
-		rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
-
-	if (!reorder)
-		return rc;
-	if (rc < 0)
-		return 0;
+			opt->dccpop_conf = 1;
 
-	/*
-	 * Reorder list: used for activating features and in dccp_insert_fn_opt.
-	 */
-	return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
-}
+			/* We got a confirmation---change the option */
+			dccp_feat_update(sk, opt->dccpop_type,
+					 opt->dccpop_feat, *val);
 
-/**
- * dccp_feat_change_recv  -  Process incoming ChangeL/R options
- * @fn: feature-negotiation list to update
- * @is_mandatory: whether the Change was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
- * @feat: one of %dccp_feature_numbers
- * @val: NN value or SP value/preference list
- * @len: length of @val in bytes
- * @server: whether this node is the server (1) or the client (0)
- */
-static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
-				u8 feat, u8 *val, u8 len, const bool server)
-{
-	u8 defval, type = dccp_feat_type(feat);
-	const bool local = (opt == DCCPO_CHANGE_R);
-	struct dccp_feat_entry *entry;
-	dccp_feat_val fval;
-
-	if (len == 0 || type == FEAT_UNKNOWN)		/* 6.1 and 6.6.8 */
-		goto unknown_feature_or_value;
-
-	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
-
-	/*
-	 *	Negotiation of NN features: Change R is invalid, so there is no
-	 *	simultaneous negotiation; hence we do not look up in the list.
-	 */
-	if (type == FEAT_NN) {
-		if (local || len > sizeof(fval.nn))
-			goto unknown_feature_or_value;
-
-		/* 6.3.2: "The feature remote MUST accept any valid value..." */
-		fval.nn = dccp_decode_value_var(val, len);
-		if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
-			goto unknown_feature_or_value;
+			/* XXX check the return value of dccp_feat_update */
+			break;
+		}
 
-		return dccp_feat_push_confirm(fn, feat, local, &fval);
+		if (!opt->dccpop_conf)
+			all_confirmed = 0;
 	}
 
-	/*
-	 *	Unidirectional/simultaneous negotiation of SP features (6.3.1)
+	/* fix re-transmit timer */
+	/* XXX gotta make sure that no option negotiation occurs during
+	 * connection shutdown.  Consider that the CLOSEREQ is sent and timer is
+	 * on.  if all options are confirmed it might kill timer which should
+	 * remain alive until close is received.
 	 */
-	entry = dccp_feat_list_lookup(fn, feat, local);
-	if (entry == NULL) {
-		/*
-		 * No particular preferences have been registered. We deal with
-		 * this situation by assuming that all valid values are equally
-		 * acceptable, and apply the following checks:
-		 * - if the peer's list is a singleton, we accept a valid value;
-		 * - if we are the server, we first try to see if the peer (the
-		 *   client) advertises the default value. If yes, we use it,
-		 *   otherwise we accept the preferred value;
-		 * - else if we are the client, we use the first list element.
-		 */
-		if (dccp_feat_clone_sp_val(&fval, val, 1))
-			return DCCP_RESET_CODE_TOO_BUSY;
-
-		if (len > 1 && server) {
-			defval = dccp_feat_default_value(feat);
-			if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
-				fval.sp.vec[0] = defval;
-		} else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
-			kfree(fval.sp.vec);
-			goto unknown_feature_or_value;
-		}
-
-		/* Treat unsupported CCIDs like invalid values */
-		if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
-			kfree(fval.sp.vec);
-			goto not_valid_or_not_known;
-		}
-
-		return dccp_feat_push_confirm(fn, feat, local, &fval);
-
-	} else if (entry->state == FEAT_UNSTABLE) {	/* 6.6.2 */
-		return 0;
+	if (all_confirmed) {
+		dccp_pr_debug("clear feat negotiation timer %p\n", sk);
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	}
 
-	if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
-		entry->empty_confirm = 0;
-	} else if (is_mandatory) {
-		return DCCP_RESET_CODE_MANDATORY_ERROR;
-	} else if (entry->state == FEAT_INITIALISING) {
-		/*
-		 * Failed simultaneous negotiation (server only): try to `save'
-		 * the connection by checking whether entry contains the default
-		 * value for @feat. If yes, send an empty Confirm to signal that
-		 * the received Change was not understood - which implies using
-		 * the default value.
-		 * If this also fails, we use Reset as the last resort.
-		 */
-		WARN_ON(!server);
-		defval = dccp_feat_default_value(feat);
-		if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
-			return DCCP_RESET_CODE_OPTION_ERROR;
-		entry->empty_confirm = 1;
-	}
-	entry->needs_confirm   = 1;
-	entry->needs_mandatory = 0;
-	entry->state	       = FEAT_STABLE;
+	if (!found)
+		dccp_pr_debug("%s(%d, ...) never requested\n",
+			      dccp_feat_typename(type), feature);
 	return 0;
-
-unknown_feature_or_value:
-	if (!is_mandatory)
-		return dccp_push_empty_confirm(fn, feat, local);
-
-not_valid_or_not_known:
-	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
-			    : DCCP_RESET_CODE_OPTION_ERROR;
 }
 
-/**
- * dccp_feat_confirm_recv  -  Process received Confirm options
- * @fn: feature-negotiation list to update
- * @is_mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
- * @feat: one of %dccp_feature_numbers
- * @val: NN value or SP value/preference list
- * @len: length of @val in bytes
- * @server: whether this node is server (1) or client (0)
- */
-static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
-				 u8 feat, u8 *val, u8 len, const bool server)
-{
-	u8 *plist, plen, type = dccp_feat_type(feat);
-	const bool local = (opt == DCCPO_CONFIRM_R);
-	struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
-
-	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
-
-	if (entry == NULL) {	/* nothing queued: ignore or handle error */
-		if (is_mandatory && type == FEAT_UNKNOWN)
-			return DCCP_RESET_CODE_MANDATORY_ERROR;
-
-		if (!local && type == FEAT_NN)		/* 6.3.2 */
-			goto confirmation_failed;
-		return 0;
-	}
-
-	if (entry->state != FEAT_CHANGING)		/* 6.6.2 */
-		return 0;
-
-	if (len == 0) {
-		if (dccp_feat_must_be_understood(feat))	/* 6.6.7 */
-			goto confirmation_failed;
-		/*
-		 * Empty Confirm during connection setup: this means reverting
-		 * to the `old' value, which in this case is the default. Since
-		 * we handle default values automatically when no other values
-		 * have been set, we revert to the old value by removing this
-		 * entry from the list.
-		 */
-		dccp_feat_list_pop(entry);
-		return 0;
-	}
+EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
 
-	if (type == FEAT_NN) {
-		if (len > sizeof(entry->val.nn))
-			goto confirmation_failed;
+void dccp_feat_clean(struct dccp_minisock *dmsk)
+{
+	struct dccp_opt_pend *opt, *next;
 
-		if (entry->val.nn == dccp_decode_value_var(val, len))
-			goto confirmation_succeeded;
+	list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending,
+				 dccpop_node) {
+		BUG_ON(opt->dccpop_val == NULL);
+		kfree(opt->dccpop_val);
 
-		DCCP_WARN("Bogus Confirm for non-existing value\n");
-		goto confirmation_failed;
-	}
+		if (opt->dccpop_sc != NULL) {
+			BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
+			kfree(opt->dccpop_sc->dccpoc_val);
+			kfree(opt->dccpop_sc);
+		}
 
-	/*
-	 * Parsing SP Confirms: the first element of @val is the preferred
-	 * SP value which the peer confirms, the remainder depends on @len.
-	 * Note that only the confirmed value need to be a valid SP value.
-	 */
-	if (!dccp_feat_is_valid_sp_val(feat, *val))
-		goto confirmation_failed;
-
-	if (len == 1) {		/* peer didn't supply a preference list */
-		plist = val;
-		plen  = len;
-	} else {		/* preferred value + preference list */
-		plist = val + 1;
-		plen  = len - 1;
+		kfree(opt);
 	}
+	INIT_LIST_HEAD(&dmsk->dccpms_pending);
 
-	/* Check whether the peer got the reconciliation right (6.6.8) */
-	if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
-		DCCP_WARN("Confirm selected the wrong value %u\n", *val);
-		return DCCP_RESET_CODE_OPTION_ERROR;
+	list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
+		BUG_ON(opt == NULL);
+		if (opt->dccpop_val != NULL)
+			kfree(opt->dccpop_val);
+		kfree(opt);
 	}
-	entry->val.sp.vec[0] = *val;
-
-confirmation_succeeded:
-	entry->state = FEAT_STABLE;
-	return 0;
-
-confirmation_failed:
-	DCCP_WARN("Confirmation failed\n");
-	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
-			    : DCCP_RESET_CODE_OPTION_ERROR;
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);
 }
 
-/**
- * dccp_feat_handle_nn_established  -  Fast-path reception of NN options
- * @sk:		socket of an established DCCP connection
- * @mandatory:	whether @opt was preceded by a Mandatory option
- * @opt:	%DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
- * @feat:	NN number, one of %dccp_feature_numbers
- * @val:	NN value
- * @len:	length of @val in bytes
- * This function combines the functionality of change_recv/confirm_recv, with
- * the following differences (reset codes are the same):
- *    - cleanup after receiving the Confirm;
- *    - values are directly activated after successful parsing;
- *    - deliberately restricted to NN features.
- * The restriction to NN features is essential since SP features can have non-
- * predictable outcomes (depending on the remote configuration), and are inter-
- * dependent (CCIDs for instance cause further dependencies).
+EXPORT_SYMBOL_GPL(dccp_feat_clean);
+
+/* this is to be called only when a listening sock creates its child.  It is
+ * assumed by the function---the confirm is not duplicated, but rather it is
+ * "passed on".
  */
-static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
-					  u8 feat, u8 *val, u8 len)
+int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
 {
-	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
-	const bool local = (opt == DCCPO_CONFIRM_R);
-	struct dccp_feat_entry *entry;
-	u8 type = dccp_feat_type(feat);
-	dccp_feat_val fval;
+	struct dccp_minisock *olddmsk = dccp_msk(oldsk);
+	struct dccp_minisock *newdmsk = dccp_msk(newsk);
+	struct dccp_opt_pend *opt;
+	int rc = 0;
 
-	dccp_feat_print_opt(opt, feat, val, len, mandatory);
+	INIT_LIST_HEAD(&newdmsk->dccpms_pending);
+	INIT_LIST_HEAD(&newdmsk->dccpms_conf);
 
-	/* Ignore non-mandatory unknown and non-NN features */
-	if (type == FEAT_UNKNOWN) {
-		if (local && !mandatory)
-			return 0;
-		goto fast_path_unknown;
-	} else if (type != FEAT_NN) {
-		return 0;
-	}
-
-	/*
-	 * We don't accept empty Confirms, since in fast-path feature
-	 * negotiation the values are enabled immediately after sending
-	 * the Change option.
-	 * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
-	 */
-	if (len == 0 || len > sizeof(fval.nn))
-		goto fast_path_unknown;
-
-	if (opt == DCCPO_CHANGE_L) {
-		fval.nn = dccp_decode_value_var(val, len);
-		if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
-			goto fast_path_unknown;
+	list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) {
+		struct dccp_opt_pend *newopt;
+		/* copy the value of the option */
+		u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC);
 
-		if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
-		    dccp_feat_activate(sk, feat, local, &fval))
-			return DCCP_RESET_CODE_TOO_BUSY;
+		if (val == NULL)
+			goto out_clean;
 
-		/* set the `Ack Pending' flag to piggyback a Confirm */
-		inet_csk_schedule_ack(sk);
-
-	} else if (opt == DCCPO_CONFIRM_R) {
-		entry = dccp_feat_list_lookup(fn, feat, local);
-		if (entry == NULL || entry->state != FEAT_CHANGING)
-			return 0;
-
-		fval.nn = dccp_decode_value_var(val, len);
-		if (fval.nn != entry->val.nn) {
-			DCCP_WARN("Bogus Confirm for non-existing value\n");
-			goto fast_path_failed;
+		newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC);
+		if (newopt == NULL) {
+			kfree(val);
+			goto out_clean;
 		}
 
-		/* It has been confirmed - so remove the entry */
-		dccp_feat_list_pop(entry);
+		/* insert the option */
+		newopt->dccpop_val = val;
+		list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
 
-	} else {
-		DCCP_WARN("Received illegal option %u\n", opt);
-		goto fast_path_failed;
+		/* XXX what happens with backlogs and multiple connections at
+		 * once...
+		 */
+		/* the master socket no longer needs to worry about confirms */
+		opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */
+
+		/* reset state for a new socket */
+		opt->dccpop_conf = 0;
 	}
-	return 0;
 
-fast_path_unknown:
-	if (!mandatory)
-		return dccp_push_empty_confirm(fn, feat, local);
+	/* XXX not doing anything about the conf queue */
+
+out:
+	return rc;
 
-fast_path_failed:
-	return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
-			 : DCCP_RESET_CODE_OPTION_ERROR;
+out_clean:
+	dccp_feat_clean(newdmsk);
+	rc = -ENOMEM;
+	goto out;
 }
 
-/**
- * dccp_feat_parse_options  -  Process Feature-Negotiation Options
- * @sk: for general use and used by the client during connection setup
- * @dreq: used by the server during connection setup
- * @mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
- * @feat: one of %dccp_feature_numbers
- * @val: value contents of @opt
- * @len: length of @val in bytes
- * Returns 0 on success, a Reset code for ending the connection otherwise.
- */
-int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
-			    u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
+EXPORT_SYMBOL_GPL(dccp_feat_clone);
+
+static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat,
+			    u8 *val, u8 len)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
-	bool server = false;
+	int rc = -ENOMEM;
+	u8 *copy = kmemdup(val, len, GFP_KERNEL);
 
-	switch (sk->sk_state) {
-	/*
-	 *	Negotiation during connection setup
-	 */
-	case DCCP_LISTEN:
-		server = true;			/* fall through */
-	case DCCP_REQUESTING:
-		switch (opt) {
-		case DCCPO_CHANGE_L:
-		case DCCPO_CHANGE_R:
-			return dccp_feat_change_recv(fn, mandatory, opt, feat,
-						     val, len, server);
-		case DCCPO_CONFIRM_R:
-		case DCCPO_CONFIRM_L:
-			return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
-						      val, len, server);
-		}
-		break;
-	/*
-	 *	Support for exchanging NN options on an established connection
-	 *	This is currently restricted to Ack Ratio (RFC 4341, 6.1.2)
-	 */
-	case DCCP_OPEN:
-	case DCCP_PARTOPEN:
-		return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
-						       val, len);
+	if (copy != NULL) {
+		rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL);
+		if (rc)
+			kfree(copy);
 	}
-	return 0;	/* ignore FN options in all other states */
+	return rc;
 }
 
-/**
- * dccp_feat_init  -  Seed feature negotiation with host-specific defaults
- * This initialises global defaults, depending on the value of the sysctls.
- * These can later be overridden by registering changes via setsockopt calls.
- * The last link in the chain is finalise_settings, to make sure that between
- * here and the start of actual feature negotiation no inconsistencies enter.
- *
- * All features not appearing below use either defaults or are otherwise
- * later adjusted through dccp_feat_finalise_settings().
- */
-int dccp_feat_init(struct sock *sk)
+int dccp_feat_init(struct dccp_minisock *dmsk)
 {
-	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
-	u8 on = 1, off = 0;
 	int rc;
-	struct {
-		u8 *val;
-		u8 len;
-	} tx, rx;
-
-	/* Non-negotiable (NN) features */
-	rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
-				    sysctl_dccp_sequence_window);
-	if (rc)
-		return rc;
 
-	/* Server-priority (SP) features */
-
-	/* Advertise that short seqnos are not supported (7.6.1) */
-	rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
-	if (rc)
-		return rc;
+	INIT_LIST_HEAD(&dmsk->dccpms_pending);
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);
 
-	/* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
-	rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
+	/* CCID L */
+	rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID,
+			      &dmsk->dccpms_tx_ccid, 1);
 	if (rc)
-		return rc;
-
-	/*
-	 * We advertise the available list of CCIDs and reorder according to
-	 * preferences, to avoid failure resulting from negotiating different
-	 * singleton values (which always leads to failure).
-	 * These settings can still (later) be overridden via sockopts.
-	 */
-	if (ccid_get_builtin_ccids(&tx.val, &tx.len) ||
-	    ccid_get_builtin_ccids(&rx.val, &rx.len))
-		return -ENOBUFS;
-
-	/* Pre-load all CCID modules that are going to be advertised */
-	rc = -EUNATCH;
-	if (ccid_request_modules(tx.val, tx.len))
-		goto free_ccid_lists;
-
-	if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
-	    !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
-		goto free_ccid_lists;
+		goto out;
 
-	rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
+	/* CCID R */
+	rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID,
+			      &dmsk->dccpms_rx_ccid, 1);
 	if (rc)
-		goto free_ccid_lists;
+		goto out;
 
-	rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
-
-free_ccid_lists:
-	kfree(tx.val);
-	kfree(rx.val);
+	/* Ack ratio */
+	rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
+			      &dmsk->dccpms_ack_ratio, 1);
+out:
 	return rc;
 }
 
-int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_feat_entry *cur, *next;
-	int idx;
-	dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
-		 [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
-	};
-
-	list_for_each_entry(cur, fn_list, node) {
-		/*
-		 * An empty Confirm means that either an unknown feature type
-		 * or an invalid value was present. In the first case there is
-		 * nothing to activate, in the other the default value is used.
-		 */
-		if (cur->empty_confirm)
-			continue;
+EXPORT_SYMBOL_GPL(dccp_feat_init);
 
-		idx = dccp_feat_index(cur->feat_num);
-		if (idx < 0) {
-			DCCP_BUG("Unknown feature %u", cur->feat_num);
-			goto activation_failed;
-		}
-		if (cur->state != FEAT_STABLE) {
-			DCCP_CRIT("Negotiation of %s %s failed in state %s",
-				  cur->is_local ? "local" : "remote",
-				  dccp_feat_fname(cur->feat_num),
-				  dccp_feat_sname[cur->state]);
-			goto activation_failed;
-		}
-		fvals[idx][cur->is_local] = &cur->val;
+#ifdef CONFIG_IP_DCCP_DEBUG
+const char *dccp_feat_typename(const u8 type)
+{
+	switch(type) {
+	case DCCPO_CHANGE_L:  return("ChangeL");
+	case DCCPO_CONFIRM_L: return("ConfirmL");
+	case DCCPO_CHANGE_R:  return("ChangeR");
+	case DCCPO_CONFIRM_R: return("ConfirmR");
+	/* the following case must not appear in feature negotation  */
+	default:	      dccp_pr_debug("unknown type %d [BUG!]\n", type);
 	}
+	return NULL;
+}
 
-	/*
-	 * Activate in decreasing order of index, so that the CCIDs are always
-	 * activated as the last feature. This avoids the case where a CCID
-	 * relies on the initialisation of one or more features that it depends
-	 * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
-	 */
-	for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
-		if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
-		    __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
-			DCCP_CRIT("Could not activate %d", idx);
-			goto activation_failed;
-		}
+EXPORT_SYMBOL_GPL(dccp_feat_typename);
 
-	/* Clean up Change options which have been confirmed already */
-	list_for_each_entry_safe(cur, next, fn_list, node)
-		if (!cur->needs_confirm)
-			dccp_feat_list_pop(cur);
+const char *dccp_feat_name(const u8 feat)
+{
+	static const char *feature_names[] = {
+		[DCCPF_RESERVED]	= "Reserved",
+		[DCCPF_CCID]		= "CCID",
+		[DCCPF_SHORT_SEQNOS]	= "Allow Short Seqnos",
+		[DCCPF_SEQUENCE_WINDOW]	= "Sequence Window",
+		[DCCPF_ECN_INCAPABLE]	= "ECN Incapable",
+		[DCCPF_ACK_RATIO]	= "Ack Ratio",
+		[DCCPF_SEND_ACK_VECTOR]	= "Send ACK Vector",
+		[DCCPF_SEND_NDP_COUNT]	= "Send NDP Count",
+		[DCCPF_MIN_CSUM_COVER]	= "Min. Csum Coverage",
+		[DCCPF_DATA_CHECKSUM]	= "Send Data Checksum",
+	};
+	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
+		return feature_names[DCCPF_RESERVED];
 
-	dccp_pr_debug("Activation OK\n");
-	return 0;
+	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
+		return "CCID-specific";
 
-activation_failed:
-	/*
-	 * We clean up everything that may have been allocated, since
-	 * it is difficult to track at which stage negotiation failed.
-	 * This is ok, since all allocation functions below are robust
-	 * against NULL arguments.
-	 */
-	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
-	dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
-	dp->dccps_hc_rx_ackvec = NULL;
-	return -1;
+	return feature_names[feat];
 }
+
+EXPORT_SYMBOL_GPL(dccp_feat_name);
+#endif /* CONFIG_IP_DCCP_DEBUG */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 2217066e22d..e272222c7ac 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -3,134 +3,38 @@
 /*
  *  net/dccp/feat.h
  *
- *  Feature negotiation for the DCCP protocol (RFC 4340, section 6)
- *  Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *  An implementation of the DCCP protocol
  *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
  *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
  */
+
 #include <linux/types.h>
 #include "dccp.h"
 
-/*
- * Known limit values
- */
-/* Ack Ratio takes 2-byte integer values (11.3) */
-#define DCCPF_ACK_RATIO_MAX	0xFFFF
-/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
-#define DCCPF_SEQ_WMIN		32
-#define DCCPF_SEQ_WMAX		0x3FFFFFFFFFFFull
-/* Maximum number of SP values that fit in a single (Confirm) option */
-#define DCCP_FEAT_MAX_SP_VALS	(DCCP_SINGLE_OPT_MAXLEN - 2)
-
-enum dccp_feat_type {
-	FEAT_AT_RX   = 1,	/* located at RX side of half-connection  */
-	FEAT_AT_TX   = 2,	/* located at TX side of half-connection  */
-	FEAT_SP      = 4,	/* server-priority reconciliation (6.3.1) */
-	FEAT_NN	     = 8,	/* non-negotiable reconciliation (6.3.2)  */
-	FEAT_UNKNOWN = 0xFF	/* not understood or invalid feature	  */
-};
-
-enum dccp_feat_state {
-	FEAT_DEFAULT = 0,	/* using default values from 6.4 */
-	FEAT_INITIALISING,	/* feature is being initialised  */
-	FEAT_CHANGING,		/* Change sent but not confirmed yet */
-	FEAT_UNSTABLE,		/* local modification in state CHANGING */
-	FEAT_STABLE		/* both ends (think they) agree */
-};
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern const char *dccp_feat_typename(const u8 type);
+extern const char *dccp_feat_name(const u8 feat);
 
-/**
- * dccp_feat_val  -  Container for SP or NN feature values
- * @nn:     single NN value
- * @sp.vec: single SP value plus optional preference list
- * @sp.len: length of @sp.vec in bytes
- */
-typedef union {
-	u64 nn;
-	struct {
-		u8	*vec;
-		u8	len;
-	}   sp;
-} dccp_feat_val;
-
-/**
- * struct feat_entry  -  Data structure to perform feature negotiation
- * @feat_num: one of %dccp_feature_numbers
- * @val: feature's current value (SP features may have preference list)
- * @state: feature's current state
- * @needs_mandatory: whether Mandatory options should be sent
- * @needs_confirm: whether to send a Confirm instead of a Change
- * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
- * @is_local: feature location (1) or feature-remote (0)
- * @node: list pointers, entries arranged in FIFO order
- */
-struct dccp_feat_entry {
-	u8                      feat_num;
-	dccp_feat_val           val;
-	enum dccp_feat_state    state:8;
-	bool			needs_mandatory:1,
-				needs_confirm:1,
-				empty_confirm:1,
-				is_local:1;
-
-	struct list_head	node;
-};
-
-static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
+static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
 {
-	if (entry->needs_confirm)
-		return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
-	return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
+	dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type),
+					   dccp_feat_name(feat), feat, val);
 }
+#else
+#define dccp_feat_debug(type, feat, val)
+#endif /* CONFIG_IP_DCCP_DEBUG */
+
+extern int  dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
+			     u8 *val, u8 len, gfp_t gfp);
+extern int  dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
+				  u8 *val, u8 len);
+extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
+				   u8 *val, u8 len);
+extern void dccp_feat_clean(struct dccp_minisock *dmsk);
+extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
+extern int  dccp_feat_init(struct dccp_minisock *dmsk);
 
-/**
- * struct ccid_dependency  -  Track changes resulting from choosing a CCID
- * @dependent_feat: one of %dccp_feature_numbers
- * @is_local: local (1) or remote (0) @dependent_feat
- * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
- * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
- */
-struct ccid_dependency {
-	u8	dependent_feat;
-	bool	is_local:1,
-		is_mandatory:1;
-	u8	val;
-};
-
-/*
- * Sysctls to seed defaults for feature negotiation
- */
-extern unsigned long sysctl_dccp_sequence_window;
-extern int	     sysctl_dccp_rx_ccid;
-extern int	     sysctl_dccp_tx_ccid;
-
-extern int  dccp_feat_init(struct sock *sk);
-extern void dccp_feat_initialise_sysctls(void);
-extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
-				  u8 const *list, u8 len);
-extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
-extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
-				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
-extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
-
-/*
- * Encoding variable-length options and their maximum length.
- *
- * This affects NN options (SP options are all u8) and other variable-length
- * options (see table 3 in RFC 4340). The limit is currently given the Sequence
- * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
- * options consume less than 6 bytes (timestamps are 4 bytes).
- * When updating this constant (e.g. due to new internet drafts / RFCs), make
- * sure that you also update all code which refers to it.
- */
-#define DCCP_OPTVAL_MAXLEN	6
-
-extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
-extern u64  dccp_decode_value_var(const u8 *bf, const u8 len);
-
-extern int  dccp_insert_option_mandatory(struct sk_buff *skb);
-extern int  dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
-			       u8 *val, u8 len, bool repeat_first);
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index df0e6714aa1..779d0ed9ae9 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -159,15 +159,13 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
 	dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
 }
 
-static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
+static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
 {
-	struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
+	struct dccp_sock *dp = dccp_sk(sk);
 
-	if (av == NULL)
-		return;
-	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
-		dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
-	dccp_ackvec_input(av, skb);
+	if (dccp_msk(sk)->dccpms_send_ack_vector)
+		dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
+					    DCCP_SKB_CB(skb)->dccpd_ack_seq);
 }
 
 static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -366,13 +364,22 @@ discard:
 int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			 const struct dccp_hdr *dh, const unsigned len)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
+
 	if (dccp_check_seqno(sk, skb))
 		goto discard;
 
 	if (dccp_parse_options(sk, NULL, skb))
 		return 1;
 
-	dccp_handle_ackvec_processing(sk, skb);
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_event_ack_recv(sk, skb);
+
+	if (dccp_msk(sk)->dccpms_send_ack_vector &&
+	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+			    DCCP_SKB_CB(skb)->dccpd_seq,
+			    DCCP_ACKVEC_STATE_RECEIVED))
+		goto discard;
 	dccp_deliver_input_to_ccids(sk, skb);
 
 	return __dccp_rcv_established(sk, skb, dh, len);
@@ -414,33 +421,40 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			goto out_invalid_packet;
 		}
 
-		/*
-		 * If option processing (Step 8) failed, return 1 here so that
-		 * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
-		 * the option type and is set in dccp_parse_options().
-		 */
 		if (dccp_parse_options(sk, NULL, skb))
-			return 1;
+			goto out_invalid_packet;
 
 		/* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
 		if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
 			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
 			    dp->dccps_options_received.dccpor_timestamp_echo));
 
+		if (dccp_msk(sk)->dccpms_send_ack_vector &&
+		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+				    DCCP_SKB_CB(skb)->dccpd_seq,
+				    DCCP_ACKVEC_STATE_RECEIVED))
+			goto out_invalid_packet; /* FIXME: change error code */
+
 		/* Stop the REQUEST timer */
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 		WARN_ON(sk->sk_send_head == NULL);
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 
+		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+		dccp_update_gsr(sk, dp->dccps_isr);
 		/*
-		 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
-		 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
-		 * is done as part of activating the feature values below, since
-		 * these settings depend on the local/remote Sequence Window
-		 * features, which were undefined or not confirmed until now.
+		 * SWL and AWL are initially adjusted so that they are not less than
+		 * the initial Sequence Numbers received and sent, respectively:
+		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+		 *	AWL := max(GSS - W' + 1, ISS).
+		 * These adjustments MUST be applied only at the beginning of the
+		 * connection.
+		 *
+		 * AWL was adjusted in dccp_v4_connect -acme
 		 */
-		dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+		dccp_set_seqno(&dp->dccps_swl,
+			       max48(dp->dccps_swl, dp->dccps_isr));
 
 		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 
@@ -461,15 +475,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		 */
 		dccp_set_state(sk, DCCP_PARTOPEN);
 
-		/*
-		 * If feature negotiation was successful, activate features now;
-		 * an activation failure means that this host could not activate
-		 * one ore more features (e.g. insufficient memory), which would
-		 * leave at least one feature in an undefined state.
-		 */
-		if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
-			goto unable_to_proceed;
-
 		/* Make sure socket is routed, for correct metrics. */
 		icsk->icsk_af_ops->rebuild_header(sk);
 
@@ -504,16 +509,6 @@ out_invalid_packet:
 	/* dccp_v4_do_rcv will send a reset */
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
 	return 1;
-
-unable_to_proceed:
-	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
-	/*
-	 * We mark this socket as no longer usable, so that the loop in
-	 * dccp_sendmsg() terminates and the application gets notified.
-	 */
-	dccp_set_state(sk, DCCP_CLOSED);
-	sk->sk_err = ECOMM;
-	return 1;
 }
 
 static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
@@ -595,6 +590,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
 								    skb) < 0)
 				return 1;
+
+			/* FIXME: do congestion control initialization */
 			goto discard;
 		}
 		if (dh->dccph_type == DCCP_PKT_RESET)
@@ -603,35 +600,29 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		/* Caller (dccp_v4_do_rcv) will send Reset */
 		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
 		return 1;
-	} else if (sk->sk_state == DCCP_CLOSED) {
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
-		return 1;
 	}
 
-	/* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
-	if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
-		goto discard;
+	if (sk->sk_state != DCCP_REQUESTING) {
+		if (dccp_check_seqno(sk, skb))
+			goto discard;
 
-	/*
-	 *   Step 7: Check for unexpected packet types
-	 *      If (S.is_server and P.type == Response)
-	 *	    or (S.is_client and P.type == Request)
-	 *	    or (S.state == RESPOND and P.type == Data),
-	 *	  Send Sync packet acknowledging P.seqno
-	 *	  Drop packet and return
-	 */
-	if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
-	     dh->dccph_type == DCCP_PKT_RESPONSE) ||
-	    (dp->dccps_role == DCCP_ROLE_CLIENT &&
-	     dh->dccph_type == DCCP_PKT_REQUEST) ||
-	    (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
-		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
-		goto discard;
-	}
+		/*
+		 * Step 8: Process options and mark acknowledgeable
+		 */
+		if (dccp_parse_options(sk, NULL, skb))
+			return 1;
 
-	/*  Step 8: Process options */
-	if (dccp_parse_options(sk, NULL, skb))
-		return 1;
+		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+			dccp_event_ack_recv(sk, skb);
+
+		if (dccp_msk(sk)->dccpms_send_ack_vector &&
+		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+				    DCCP_SKB_CB(skb)->dccpd_seq,
+				    DCCP_ACKVEC_STATE_RECEIVED))
+			goto discard;
+
+		dccp_deliver_input_to_ccids(sk, skb);
+	}
 
 	/*
 	 *  Step 9: Process Reset
@@ -640,22 +631,44 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	 *		S.state := TIMEWAIT
 	 *		Set TIMEWAIT timer
 	 *		Drop packet and return
-	 */
+	*/
 	if (dh->dccph_type == DCCP_PKT_RESET) {
 		dccp_rcv_reset(sk, skb);
 		return 0;
-	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {	/* Step 13 */
+		/*
+		 *   Step 7: Check for unexpected packet types
+		 *      If (S.is_server and P.type == Response)
+		 *	    or (S.is_client and P.type == Request)
+		 *	    or (S.state == RESPOND and P.type == Data),
+		 *	  Send Sync packet acknowledging P.seqno
+		 *	  Drop packet and return
+		 */
+	} else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+		    dh->dccph_type == DCCP_PKT_RESPONSE) ||
+		    (dp->dccps_role == DCCP_ROLE_CLIENT &&
+		     dh->dccph_type == DCCP_PKT_REQUEST) ||
+		    (sk->sk_state == DCCP_RESPOND &&
+		     dh->dccph_type == DCCP_PKT_DATA)) {
+		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+		goto discard;
+	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
 		if (dccp_rcv_closereq(sk, skb))
 			return 0;
 		goto discard;
-	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {		/* Step 14 */
+	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {
 		if (dccp_rcv_close(sk, skb))
 			return 0;
 		goto discard;
 	}
 
 	switch (sk->sk_state) {
+	case DCCP_CLOSED:
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+		return 1;
+
 	case DCCP_REQUESTING:
+		/* FIXME: do congestion control initialization */
+
 		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
 		if (queued >= 0)
 			return queued;
@@ -663,12 +676,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		__kfree_skb(skb);
 		return 0;
 
-	case DCCP_PARTOPEN:
-		/* Step 8: if using Ack Vectors, mark packet acknowledgeable */
-		dccp_handle_ackvec_processing(sk, skb);
-		dccp_deliver_input_to_ccids(sk, skb);
-		/* fall through */
 	case DCCP_RESPOND:
+	case DCCP_PARTOPEN:
 		queued = dccp_rcv_respond_partopen_state_process(sk, skb,
 								 dh, len);
 		break;
@@ -707,7 +716,16 @@ u32 dccp_sample_rtt(struct sock *sk, long delta)
 	/* dccpor_elapsed_time is either zeroed out or set and > 0 */
 	delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
 
-	return dccp_sane_rtt(delta);
+	if (unlikely(delta <= 0)) {
+		DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
+		return DCCP_SANE_RTT_MIN;
+	}
+	if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
+		DCCP_WARN("RTT sample %ld too large, using max\n", delta);
+		return DCCP_SANE_RTT_MAX;
+	}
+
+	return delta;
 }
 
 EXPORT_SYMBOL_GPL(dccp_sample_rtt);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b623f6b2548..882c5c4de69 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -545,7 +545,6 @@ out:
 
 static void dccp_v4_reqsk_destructor(struct request_sock *req)
 {
-	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
 	kfree(inet_rsk(req)->opt);
 }
 
@@ -596,8 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
-		goto drop_and_free;
+	dccp_reqsk_init(req, skb);
 
 	dreq = dccp_rsk(req);
 	if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ad6212e0043..5e1ee0da2c4 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -302,7 +302,6 @@ done:
 
 static void dccp_v6_reqsk_destructor(struct request_sock *req)
 {
-	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
 	if (inet6_rsk(req)->pktopts != NULL)
 		kfree_skb(inet6_rsk(req)->pktopts);
 }
@@ -425,8 +424,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
-		goto drop_and_free;
+	dccp_reqsk_init(req, skb);
 
 	dreq = dccp_rsk(req);
 	if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index f4d9c8f60ed..b2804e2d1b8 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -42,6 +42,16 @@ struct inet_timewait_death_row dccp_death_row = {
 
 EXPORT_SYMBOL_GPL(dccp_death_row);
 
+void dccp_minisock_init(struct dccp_minisock *dmsk)
+{
+	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
+	dmsk->dccpms_rx_ccid	     = sysctl_dccp_feat_rx_ccid;
+	dmsk->dccpms_tx_ccid	     = sysctl_dccp_feat_tx_ccid;
+	dmsk->dccpms_ack_ratio	     = sysctl_dccp_feat_ack_ratio;
+	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
+	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
+}
+
 void dccp_time_wait(struct sock *sk, int state, int timeo)
 {
 	struct inet_timewait_sock *tw = NULL;
@@ -102,9 +112,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
-		struct dccp_request_sock *dreq = dccp_rsk(req);
+		const struct dccp_request_sock *dreq = dccp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct dccp_sock *newdp = dccp_sk(newsk);
+		struct dccp_minisock *newdmsk = dccp_msk(newsk);
 
 		newdp->dccps_role	    = DCCP_ROLE_SERVER;
 		newdp->dccps_hc_rx_ackvec   = NULL;
@@ -114,32 +125,65 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
 		newicsk->icsk_rto	    = DCCP_TIMEOUT_INIT;
 
-		INIT_LIST_HEAD(&newdp->dccps_featneg);
+		if (dccp_feat_clone(sk, newsk))
+			goto out_free;
+
+		if (newdmsk->dccpms_send_ack_vector) {
+			newdp->dccps_hc_rx_ackvec =
+						dccp_ackvec_alloc(GFP_ATOMIC);
+			if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
+				goto out_free;
+		}
+
+		newdp->dccps_hc_rx_ccid =
+			    ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
+					   newsk, GFP_ATOMIC);
+		newdp->dccps_hc_tx_ccid =
+			    ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
+					   newsk, GFP_ATOMIC);
+		if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
+			     newdp->dccps_hc_tx_ccid == NULL)) {
+			dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
+			ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
+			ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
+out_free:
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			return NULL;
+		}
+
 		/*
 		 * Step 3: Process LISTEN state
 		 *
 		 *    Choose S.ISS (initial seqno) or set from Init Cookies
 		 *    Initialize S.GAR := S.ISS
-		 *    Set S.ISR, S.GSR from packet (or Init Cookies)
-		 *
-		 *    Setting AWL/AWH and SWL/SWH happens as part of the feature
-		 *    activation below, as these windows all depend on the local
-		 *    and remote Sequence Window feature values (7.5.2).
+		 *    Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
 		 */
-		newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
-		newdp->dccps_gar = newdp->dccps_iss;
-		newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
+
+		/* See dccp_v4_conn_request */
+		newdmsk->dccpms_sequence_window = req->rcv_wnd;
+
+		newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
+		dccp_update_gss(newsk, dreq->dreq_iss);
+
+		newdp->dccps_isr = dreq->dreq_isr;
+		dccp_update_gsr(newsk, dreq->dreq_isr);
 
 		/*
-		 * Activate features: initialise CCIDs, sequence windows etc.
+		 * SWL and AWL are initially adjusted so that they are not less than
+		 * the initial Sequence Numbers received and sent, respectively:
+		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+		 *	AWL := max(GSS - W' + 1, ISS).
+		 * These adjustments MUST be applied only at the beginning of the
+		 * connection.
 		 */
-		if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
-			/* It is still raw copy of parent, so invalidate
-			 * destructor and make plain sk_free() */
-			newsk->sk_destruct = NULL;
-			sk_free(newsk);
-			return NULL;
-		}
+		dccp_set_seqno(&newdp->dccps_swl,
+			       max48(newdp->dccps_swl, newdp->dccps_isr));
+		dccp_set_seqno(&newdp->dccps_awl,
+			       max48(newdp->dccps_awl, newdp->dccps_iss));
+
 		dccp_init_xmit_timers(newsk);
 
 		DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
@@ -260,17 +304,14 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 
 EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
 
-int dccp_reqsk_init(struct request_sock *req,
-		    struct dccp_sock const *dp, struct sk_buff const *skb)
+void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
 {
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
 	inet_rsk(req)->rmt_port	  = dccp_hdr(skb)->dccph_sport;
 	inet_rsk(req)->acked	  = 0;
+	req->rcv_wnd		  = sysctl_dccp_feat_sequence_window;
 	dreq->dreq_timestamp_echo = 0;
-
-	/* inherit feature negotiation options from listening socket */
-	return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
 }
 
 EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index e5a32979d7d..0809b63cb05 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -23,20 +23,23 @@
 #include "dccp.h"
 #include "feat.h"
 
-u64 dccp_decode_value_var(const u8 *bf, const u8 len)
+int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
+int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
+int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
+int sysctl_dccp_feat_ack_ratio	      = DCCPF_INITIAL_ACK_RATIO;
+int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
+int sysctl_dccp_feat_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
+
+static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
 {
-	u64 value = 0;
+	u32 value = 0;
 
-	if (len >= DCCP_OPTVAL_MAXLEN)
-		value += ((u64)*bf++) << 40;
-	if (len > 4)
-		value += ((u64)*bf++) << 32;
 	if (len > 3)
-		value += ((u64)*bf++) << 24;
+		value += *bf++ << 24;
 	if (len > 2)
-		value += ((u64)*bf++) << 16;
+		value += *bf++ << 16;
 	if (len > 1)
-		value += ((u64)*bf++) << 8;
+		value += *bf++ << 8;
 	if (len > 0)
 		value += *bf;
 
@@ -54,6 +57,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	struct dccp_sock *dp = dccp_sk(sk);
 	const struct dccp_hdr *dh = dccp_hdr(skb);
 	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+	u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
 	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
 	unsigned char *opt_ptr = options;
 	const unsigned char *opt_end = (unsigned char *)dh +
@@ -95,11 +99,18 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		}
 
 		/*
+		 * CCID-Specific Options (from RFC 4340, sec. 10.3):
+		 *
+		 * Option numbers 128 through 191 are for options sent from the
+		 * HC-Sender to the HC-Receiver; option numbers 192 through 255
+		 * are for options sent from the HC-Receiver to	the HC-Sender.
+		 *
 		 * CCID-specific options are ignored during connection setup, as
 		 * negotiation may still be in progress (see RFC 4340, 10.3).
 		 * The same applies to Ack Vectors, as these depend on the CCID.
+		 *
 		 */
-		if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
+		if (dreq != NULL && (opt >= 128 ||
 		    opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
 			goto ignore_option;
 
@@ -120,13 +131,43 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
 				      (unsigned long long)opt_recv->dccpor_ndp);
 			break;
-		case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
-			if (pkt_type == DCCP_PKT_DATA)      /* RFC 4340, 6 */
+		case DCCPO_CHANGE_L:
+			/* fall through */
+		case DCCPO_CHANGE_R:
+			if (pkt_type == DCCP_PKT_DATA)
 				break;
-			rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
-						    *value, value + 1, len - 1);
-			if (rc)
-				goto out_featneg_failed;
+			if (len < 2)
+				goto out_invalid_option;
+			rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
+						   len - 1);
+			/*
+			 * When there is a change error, change_recv is
+			 * responsible for dealing with it.  i.e. reply with an
+			 * empty confirm.
+			 * If the change was mandatory, then we need to die.
+			 */
+			if (rc && mandatory)
+				goto out_invalid_option;
+			break;
+		case DCCPO_CONFIRM_L:
+			/* fall through */
+		case DCCPO_CONFIRM_R:
+			if (pkt_type == DCCP_PKT_DATA)
+				break;
+			if (len < 2)	/* FIXME this disallows empty confirm */
+				goto out_invalid_option;
+			if (dccp_feat_confirm_recv(sk, opt, *value,
+						   value + 1, len - 1))
+				goto out_invalid_option;
+			break;
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
+				break;
+
+			if (dccp_msk(sk)->dccpms_send_ack_vector &&
+			    dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
+				goto out_invalid_option;
 			break;
 		case DCCPO_TIMESTAMP:
 			if (len != 4)
@@ -154,8 +195,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 				      dccp_role(sk), ntohl(opt_val),
 				      (unsigned long long)
 				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
-			/* schedule an Ack in case this sender is quiescent */
-			inet_csk_schedule_ack(sk);
 			break;
 		case DCCPO_TIMESTAMP_ECHO:
 			if (len != 4 && len != 6 && len != 8)
@@ -212,25 +251,23 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
 				      dccp_role(sk), elapsed_time);
 			break;
-		case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
+		case 128 ... 191: {
+			const u16 idx = value - options;
+
 			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
-						     pkt_type, opt, value, len))
+						     opt, len, idx,
+						     value) != 0)
 				goto out_invalid_option;
+		}
 			break;
-		case DCCPO_ACK_VECTOR_0:
-		case DCCPO_ACK_VECTOR_1:
-			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
-				break;
-			/*
-			 * Ack vectors are processed by the TX CCID if it is
-			 * interested. The RX CCID need not parse Ack Vectors,
-			 * since it is only interested in clearing old state.
-			 * Fall through.
-			 */
-		case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
+		case 192 ... 255: {
+			const u16 idx = value - options;
+
 			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
-						     pkt_type, opt, value, len))
+						     opt, len, idx,
+						     value) != 0)
 				goto out_invalid_option;
+		}
 			break;
 		default:
 			DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -252,10 +289,8 @@ out_nonsensical_length:
 
 out_invalid_option:
 	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
-	rc = DCCP_RESET_CODE_OPTION_ERROR;
-out_featneg_failed:
-	DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
-	DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
+	DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
 	DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
 	DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
 	DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
@@ -264,12 +299,9 @@ out_featneg_failed:
 
 EXPORT_SYMBOL_GPL(dccp_parse_options);
 
-void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
+static void dccp_encode_value_var(const u32 value, unsigned char *to,
+				  const unsigned int len)
 {
-	if (len >= DCCP_OPTVAL_MAXLEN)
-		*to++ = (value & 0xFF0000000000ull) >> 40;
-	if (len > 4)
-		*to++ = (value & 0xFF00000000ull) >> 32;
 	if (len > 3)
 		*to++ = (value & 0xFF000000) >> 24;
 	if (len > 2)
@@ -429,140 +461,92 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
 	return 0;
 }
 
-static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
+				u8 *val, u8 len)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
-	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-	const u16 buflen = dccp_ackvec_buflen(av);
-	/* Figure out how many options do we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = buflen + 2 * nr_opts;
-	u8 i, nonce = 0;
-	const unsigned char *tail, *from;
-	unsigned char *to;
+	u8 *to;
 
-	if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
-		DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
-			  dccp_packet_name(dcb->dccpd_type));
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) {
+		DCCP_WARN("packet too small for feature %d option!\n", feat);
 		return -1;
 	}
-	/*
-	 * Since Ack Vectors are variable-length, we can not always predict
-	 * their size. To catch exception cases where the space is running out
-	 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
-	 */
-	if (len > DCCPAV_MIN_OPTLEN &&
-	    len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
-		DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
-			  "MPS=%u ==> reduce payload size?\n", len, skb->len,
-			  dcb->dccpd_opt_len, dp->dccps_mss_cache);
-		dp->dccps_sync_scheduled = 1;
-		return 0;
-	}
-	dcb->dccpd_opt_len += len;
 
-	to   = skb_push(skb, len);
-	len  = buflen;
-	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3;
 
-	for (i = 0; i < nr_opts; ++i) {
-		int copylen = len;
-
-		if (len > DCCP_SINGLE_OPT_MAXLEN)
-			copylen = DCCP_SINGLE_OPT_MAXLEN;
-
-		/*
-		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
-		 * its type; ack_nonce is the sum of all individual buf_nonce's.
-		 */
-		nonce ^= av->av_buf_nonce[i];
-
-		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
-		*to++ = copylen + 2;
-
-		/* Check if buf_head wraps */
-		if (from + copylen > tail) {
-			const u16 tailsize = tail - from;
-
-			memcpy(to, from, tailsize);
-			to	+= tailsize;
-			len	-= tailsize;
-			copylen	-= tailsize;
-			from	= av->av_buf;
-		}
-
-		memcpy(to, from, copylen);
-		from += copylen;
-		to   += copylen;
-		len  -= copylen;
-	}
-	/*
-	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
-	 */
-	if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
-		return -ENOBUFS;
-	return 0;
-}
+	to    = skb_push(skb, len + 3);
+	*to++ = type;
+	*to++ = len + 3;
+	*to++ = feat;
 
-/**
- * dccp_insert_option_mandatory  -  Mandatory option (5.8.2)
- * Note that since we are using skb_push, this function needs to be called
- * _after_ inserting the option it is supposed to influence (stack order).
- */
-int dccp_insert_option_mandatory(struct sk_buff *skb)
-{
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
-		return -1;
+	if (len)
+		memcpy(to, val, len);
 
-	DCCP_SKB_CB(skb)->dccpd_opt_len++;
-	*skb_push(skb, 1) = DCCPO_MANDATORY;
+	dccp_pr_debug("%s(%s (%d), ...), length %d\n",
+		      dccp_feat_typename(type),
+		      dccp_feat_name(feat), feat, len);
 	return 0;
 }
 
-/**
- * dccp_insert_fn_opt  -  Insert single Feature-Negotiation option into @skb
- * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
- * @feat: one out of %dccp_feature_numbers
- * @val: NN value or SP array (preferred element first) to copy
- * @len: true length of @val in bytes (excluding first element repetition)
- * @repeat_first: whether to copy the first element of @val twice
- * The last argument is used to construct Confirm options, where the preferred
- * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
- * lists are kept such that the preferred entry is always first, so we only need
- * to copy twice, and avoid the overhead of cloning into a bigger array.
- */
-int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
-		       u8 *val, u8 len, bool repeat_first)
+static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
 {
-	u8 tot_len, *to;
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
+	struct dccp_opt_pend *opt, *next;
+	int change = 0;
+
+	/* confirm any options [NN opts] */
+	list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
+		dccp_insert_feat_opt(skb, opt->dccpop_type,
+				     opt->dccpop_feat, opt->dccpop_val,
+				     opt->dccpop_len);
+		/* fear empty confirms */
+		if (opt->dccpop_val)
+			kfree(opt->dccpop_val);
+		kfree(opt);
+	}
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);
+
+	/* see which features we need to send */
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
+		/* see if we need to send any confirm */
+		if (opt->dccpop_sc) {
+			dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
+					     opt->dccpop_feat,
+					     opt->dccpop_sc->dccpoc_val,
+					     opt->dccpop_sc->dccpoc_len);
+
+			BUG_ON(!opt->dccpop_sc->dccpoc_val);
+			kfree(opt->dccpop_sc->dccpoc_val);
+			kfree(opt->dccpop_sc);
+			opt->dccpop_sc = NULL;
+		}
 
-	/* take the `Feature' field and possible repetition into account */
-	if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
-		DCCP_WARN("length %u for feature %u too large\n", len, feat);
-		return -1;
+		/* any option not confirmed, re-send it */
+		if (!opt->dccpop_conf) {
+			dccp_insert_feat_opt(skb, opt->dccpop_type,
+					     opt->dccpop_feat, opt->dccpop_val,
+					     opt->dccpop_len);
+			change++;
+		}
 	}
 
-	if (unlikely(val == NULL || len == 0))
-		len = repeat_first = 0;
-	tot_len = 3 + repeat_first + len;
+	/* Retransmit timer.
+	 * If this is the master listening sock, we don't set a timer on it.  It
+	 * should be fine because if the dude doesn't receive our RESPONSE
+	 * [which will contain the CHANGE] he will send another REQUEST which
+	 * will "retrnasmit" the change.
+	 */
+	if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
+		dccp_pr_debug("reset feat negotiation timer %p\n", sk);
 
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
-		DCCP_WARN("packet too small for feature %d option!\n", feat);
-		return -1;
+		/* XXX don't reset the timer on re-transmissions.  I.e. reset it
+		 * only when sending new stuff i guess.  Currently the timer
+		 * never backs off because on re-transmission it just resets it!
+		 */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
 	}
-	DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
-
-	to    = skb_push(skb, tot_len);
-	*to++ = type;
-	*to++ = tot_len;
-	*to++ = feat;
 
-	if (repeat_first)
-		*to++ = *val;
-	if (len)
-		memcpy(to, val, len);
 	return 0;
 }
 
@@ -581,30 +565,19 @@ static void dccp_insert_option_padding(struct sk_buff *skb)
 int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
-	if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
+	if (dmsk->dccpms_send_ndp_count &&
+	    dccp_insert_option_ndp(sk, skb))
 		return -1;
 
-	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
-
-		/* Feature Negotiation */
-		if (dccp_feat_insert_opts(dp, NULL, skb))
+	if (!dccp_packet_without_ack(skb)) {
+		if (dmsk->dccpms_send_ack_vector &&
+		    dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
+		    dccp_insert_option_ackvec(sk, skb))
 			return -1;
-
-		if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
-			/*
-			 * Obtain RTT sample from Request/Response exchange.
-			 * This is currently used in CCID 3 initialisation.
-			 */
-			if (dccp_insert_option_timestamp(sk, skb))
-				return -1;
-
-		} else if (dccp_ackvec_pending(sk) &&
-			   dccp_insert_option_ackvec(sk, skb)) {
-				return -1;
-		}
 	}
 
 	if (dp->dccps_hc_rx_insert_options) {
@@ -613,6 +586,21 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 		dp->dccps_hc_rx_insert_options = 0;
 	}
 
+	/* Feature negotiation */
+	/* Data packets can't do feat negotiation */
+	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
+	    DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK &&
+	    dccp_insert_options_feat(sk, skb))
+		return -1;
+
+	/*
+	 * Obtain RTT sample from Request/Response exchange.
+	 * This is currently used in CCID 3 initialisation.
+	 */
+	if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST &&
+	    dccp_insert_option_timestamp(sk, skb))
+		return -1;
+
 	if (dp->dccps_timestamp_echo != 0 &&
 	    dccp_insert_option_timestamp_echo(dp, NULL, skb))
 		return -1;
@@ -625,9 +613,6 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
 {
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
-	if (dccp_feat_insert_opts(NULL, dreq, skb))
-		return -1;
-
 	if (dreq->dreq_timestamp_echo != 0 &&
 	    dccp_insert_option_timestamp_echo(NULL, dreq, skb))
 		return -1;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 2532797a800..d06945c7d3d 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -26,13 +26,11 @@ static inline void dccp_event_ack_sent(struct sock *sk)
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
-/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
-static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
 {
 	skb_set_owner_w(skb, sk);
 	WARN_ON(sk->sk_send_head);
 	sk->sk_send_head = skb;
-	return skb_clone(sk->sk_send_head, gfp_any());
 }
 
 /*
@@ -163,27 +161,21 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
 	u32 ccmps = dccp_determine_ccmps(dp);
-	u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
+	int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
 
 	/* Account for header lengths and IPv4/v6 option overhead */
 	cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
 		    sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
 
 	/*
-	 * Leave enough headroom for common DCCP header options.
-	 * This only considers options which may appear on DCCP-Data packets, as
-	 * per table 3 in RFC 4340, 5.8. When running out of space for other
-	 * options (eg. Ack Vector which can take up to 255 bytes), it is better
-	 * to schedule a separate Ack. Thus we leave headroom for the following:
-	 *  - 1 byte for Slow Receiver (11.6)
-	 *  - 6 bytes for Timestamp (13.1)
-	 *  - 10 bytes for Timestamp Echo (13.3)
-	 *  - 8 bytes for NDP count (7.7, when activated)
-	 *  - 6 bytes for Data Checksum (9.3)
-	 *  - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
+	 * FIXME: this should come from the CCID infrastructure, where, say,
+	 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
+	 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
+	 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
+	 * make it a multiple of 4
 	 */
-	cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
-			   (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
+
+	cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
 
 	/* And store cached results */
 	icsk->icsk_pmtu_cookie = pmtu;
@@ -208,158 +200,95 @@ void dccp_write_space(struct sock *sk)
 }
 
 /**
- * dccp_wait_for_ccid  -  Await CCID send permission
+ * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
  * @sk:    socket to wait for
- * @delay: timeout in jiffies
- * This is used by CCIDs which need to delay the send time in process context.
+ * @skb:   current skb to pass on for waiting
+ * @delay: sleep timeout in milliseconds (> 0)
+ * This function is called by default when the socket is closed, and
+ * when a non-zero linger time is set on the socket. For consistency
  */
-static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
+static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
 	DEFINE_WAIT(wait);
-	long remaining;
-
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
-	sk->sk_write_pending++;
-	release_sock(sk);
+	unsigned long jiffdelay;
+	int rc;
 
-	remaining = schedule_timeout(delay);
-
-	lock_sock(sk);
-	sk->sk_write_pending--;
-	finish_wait(sk->sk_sleep, &wait);
+	do {
+		dccp_pr_debug("delayed send by %d msec\n", delay);
+		jiffdelay = msecs_to_jiffies(delay);
 
-	if (signal_pending(current) || sk->sk_err)
-		return -1;
-	return remaining;
-}
-
-/**
- * dccp_xmit_packet  -  Send data packet under control of CCID
- * Transmits next-queued payload and informs CCID to account for the packet.
- */
-static void dccp_xmit_packet(struct sock *sk)
-{
-	int err, len;
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct sk_buff *skb = dccp_qpolicy_pop(sk);
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 
-	if (unlikely(skb == NULL))
-		return;
-	len = skb->len;
+		sk->sk_write_pending++;
+		release_sock(sk);
+		schedule_timeout(jiffdelay);
+		lock_sock(sk);
+		sk->sk_write_pending--;
 
-	if (sk->sk_state == DCCP_PARTOPEN) {
-		const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
-		/*
-		 * See 8.1.5 - Handshake Completion.
-		 *
-		 * For robustness we resend Confirm options until the client has
-		 * entered OPEN. During the initial feature negotiation, the MPS
-		 * is smaller than usual, reduced by the Change/Confirm options.
-		 */
-		if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
-			DCCP_WARN("Payload too large (%d) for featneg.\n", len);
-			dccp_send_ack(sk);
-			dccp_feat_list_purge(&dp->dccps_featneg);
-		}
+		if (sk->sk_err)
+			goto do_error;
+		if (signal_pending(current))
+			goto do_interrupted;
 
-		inet_csk_schedule_ack(sk);
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-					      inet_csk(sk)->icsk_rto,
-					      DCCP_RTO_MAX);
-		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
-	} else if (dccp_ack_pending(sk)) {
-		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
-	} else {
-		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
-	}
-
-	err = dccp_transmit_skb(sk, skb);
-	if (err)
-		dccp_pr_debug("transmit_skb() returned err=%d\n", err);
-	/*
-	 * Register this one as sent even if an error occurred. To the remote
-	 * end a local packet drop is indistinguishable from network loss, i.e.
-	 * any local drop will eventually be reported via receiver feedback.
-	 */
-	ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
-
-	/*
-	 * If the CCID needs to transfer additional header options out-of-band
-	 * (e.g. Ack Vectors or feature-negotiation options), it activates this
-	 * flag to schedule a Sync. The Sync will automatically incorporate all
-	 * currently pending header options, thus clearing the backlog.
-	 */
-	if (dp->dccps_sync_scheduled)
-		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+	} while ((delay = rc) > 0);
+out:
+	finish_wait(sk->sk_sleep, &wait);
+	return rc;
+
+do_error:
+	rc = -EPIPE;
+	goto out;
+do_interrupted:
+	rc = -EINTR;
+	goto out;
 }
 
-/**
- * dccp_flush_write_queue  -  Drain queue at end of connection
- * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
- * happen that the TX queue is not empty at the end of a connection. We give the
- * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
- * returns with a non-empty write queue, it will be purged later.
- */
-void dccp_flush_write_queue(struct sock *sk, long *time_budget)
+void dccp_write_xmit(struct sock *sk, int block)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
-	long delay, rc;
-
-	while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
-		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
 
-		switch (ccid_packet_dequeue_eval(rc)) {
-		case CCID_PACKET_WILL_DEQUEUE_LATER:
-			/*
-			 * If the CCID determines when to send, the next sending
-			 * time is unknown or the CCID may not even send again
-			 * (e.g. remote host crashes or lost Ack packets).
-			 */
-			DCCP_WARN("CCID did not manage to send all packets\n");
-			return;
-		case CCID_PACKET_DELAY:
-			delay = msecs_to_jiffies(rc);
-			if (delay > *time_budget)
-				return;
-			rc = dccp_wait_for_ccid(sk, delay);
-			if (rc < 0)
-				return;
-			*time_budget -= (delay - rc);
-			/* check again if we can send now */
-			break;
-		case CCID_PACKET_SEND_AT_ONCE:
-			dccp_xmit_packet(sk);
-			break;
-		case CCID_PACKET_ERR:
-			skb_dequeue(&sk->sk_write_queue);
-			kfree_skb(skb);
-			dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+	while ((skb = skb_peek(&sk->sk_write_queue))) {
+		int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+		if (err > 0) {
+			if (!block) {
+				sk_reset_timer(sk, &dp->dccps_xmit_timer,
+						msecs_to_jiffies(err)+jiffies);
+				break;
+			} else
+				err = dccp_wait_for_ccid(sk, skb, err);
+			if (err && err != -EINTR)
+				DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
 		}
-	}
-}
 
-void dccp_write_xmit(struct sock *sk)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct sk_buff *skb;
+		skb_dequeue(&sk->sk_write_queue);
+		if (err == 0) {
+			struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+			const int len = skb->len;
 
-	while ((skb = dccp_qpolicy_top(sk))) {
-		int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
-		switch (ccid_packet_dequeue_eval(rc)) {
-		case CCID_PACKET_WILL_DEQUEUE_LATER:
-			return;
-		case CCID_PACKET_DELAY:
-			sk_reset_timer(sk, &dp->dccps_xmit_timer,
-				       jiffies + msecs_to_jiffies(rc));
-			return;
-		case CCID_PACKET_SEND_AT_ONCE:
-			dccp_xmit_packet(sk);
-			break;
-		case CCID_PACKET_ERR:
-			dccp_qpolicy_drop(sk, skb);
-			dccp_pr_debug("packet discarded due to err=%d\n", rc);
+			if (sk->sk_state == DCCP_PARTOPEN) {
+				/* See 8.1.5.  Handshake Completion */
+				inet_csk_schedule_ack(sk);
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  inet_csk(sk)->icsk_rto,
+						  DCCP_RTO_MAX);
+				dcb->dccpd_type = DCCP_PKT_DATAACK;
+			} else if (dccp_ack_pending(sk))
+				dcb->dccpd_type = DCCP_PKT_DATAACK;
+			else
+				dcb->dccpd_type = DCCP_PKT_DATA;
+
+			err = dccp_transmit_skb(sk, skb);
+			ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
+			if (err)
+				DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
+					 err);
+		} else {
+			dccp_pr_debug("packet discarded due to err=%d\n", err);
+			kfree_skb(skb);
 		}
 	}
 }
@@ -410,12 +339,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
 	DCCP_SKB_CB(skb)->dccpd_seq  = dreq->dreq_iss;
 
-	/* Resolve feature dependencies resulting from choice of CCID */
-	if (dccp_feat_server_ccid_dependencies(dreq))
-		goto response_failed;
-
-	if (dccp_insert_options_rsk(dreq, skb))
-		goto response_failed;
+	if (dccp_insert_options_rsk(dreq, skb)) {
+		kfree_skb(skb);
+		return NULL;
+	}
 
 	/* Build and checksum header */
 	dh = dccp_zeroed_hdr(skb, dccp_header_size);
@@ -436,9 +363,6 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	inet_rsk(req)->acked = 1;
 	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
 	return skb;
-response_failed:
-	kfree_skb(skb);
-	return NULL;
 }
 
 EXPORT_SYMBOL_GPL(dccp_make_response);
@@ -523,9 +447,8 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
 /*
  * Do all connect socket setups that can be done AF independent.
  */
-int dccp_connect(struct sock *sk)
+static inline void dccp_connect_init(struct sock *sk)
 {
-	struct sk_buff *skb;
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -535,13 +458,19 @@ int dccp_connect(struct sock *sk)
 
 	dccp_sync_mss(sk, dst_mtu(dst));
 
-	/* do not connect if feature negotiation setup fails */
-	if (dccp_feat_finalise_settings(dccp_sk(sk)))
-		return -EPROTO;
-
 	/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
 	dp->dccps_gar = dp->dccps_iss;
 
+	icsk->icsk_retransmits = 0;
+}
+
+int dccp_connect(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	dccp_connect_init(sk);
+
 	skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
 	if (unlikely(skb == NULL))
 		return -ENOBUFS;
@@ -551,11 +480,11 @@ int dccp_connect(struct sock *sk)
 
 	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
 
-	dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
+	dccp_skb_entail(sk, skb);
+	dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
 	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the REQUEST until an answer. */
-	icsk->icsk_retransmits = 0;
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 				  icsk->icsk_rto, DCCP_RTO_MAX);
 	return 0;
@@ -642,12 +571,6 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
 	DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
 	DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
 
-	/*
-	 * Clear the flag in case the Sync was scheduled for out-of-band data,
-	 * such as carrying a long Ack Vector.
-	 */
-	dccp_sk(sk)->dccps_sync_scheduled = 0;
-
 	dccp_transmit_skb(sk, skb);
 }
 
@@ -676,7 +599,9 @@ void dccp_send_close(struct sock *sk, const int active)
 		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
 
 	if (active) {
-		skb = dccp_skb_entail(sk, skb);
+		dccp_write_xmit(sk, 1);
+		dccp_skb_entail(sk, skb);
+		dccp_transmit_skb(sk, skb_clone(skb, prio));
 		/*
 		 * Retransmission timer for active-close: RFC 4340, 8.3 requires
 		 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
@@ -689,6 +614,6 @@ void dccp_send_close(struct sock *sk, const int active)
 		 */
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 					  DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
-	}
-	dccp_transmit_skb(sk, skb);
+	} else
+		dccp_transmit_skb(sk, skb);
 }
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index eaa59d82ab0..81368a7f537 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -46,54 +46,75 @@ static struct {
 	struct kfifo	  *fifo;
 	spinlock_t	  lock;
 	wait_queue_head_t wait;
-	ktime_t		  start;
+	struct timespec	  tstart;
 } dccpw;
 
-static void jdccp_write_xmit(struct sock *sk)
+static void printl(const char *fmt, ...)
 {
-	const struct inet_sock *inet = inet_sk(sk);
-	struct ccid3_hc_tx_sock *hctx = NULL;
-	struct timespec tv;
-	char buf[256];
-	int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk));
+	va_list args;
+	int len;
+	struct timespec now;
+	char tbuf[256];
 
-	if (ccid == DCCPC_CCID3)
-		hctx = ccid3_hc_tx_sk(sk);
+	va_start(args, fmt);
+	getnstimeofday(&now);
 
-	if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) {
+	now = timespec_sub(now, dccpw.tstart);
 
-		tv  = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start));
-		len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d",
-			       (unsigned long)tv.tv_sec,
-			       (unsigned long)tv.tv_nsec,
-			       NIPQUAD(inet->saddr), ntohs(inet->sport),
-			       NIPQUAD(inet->daddr), ntohs(inet->dport), ccid);
+	len = sprintf(tbuf, "%lu.%06lu ",
+		      (unsigned long) now.tv_sec,
+		      (unsigned long) now.tv_nsec / NSEC_PER_USEC);
+	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
+	va_end(args);
 
+	kfifo_put(dccpw.fifo, tbuf, len);
+	wake_up(&dccpw.wait);
+}
+
+static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+			 struct msghdr *msg, size_t size)
+{
+	const struct dccp_minisock *dmsk = dccp_msk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ccid3_hc_tx_sock *hctx;
+
+	if (dmsk->dccpms_tx_ccid == DCCPC_CCID3)
+		hctx = ccid3_hc_tx_sk(sk);
+	else
+		hctx = NULL;
+
+	if (port == 0 || ntohs(inet->dport) == port ||
+	    ntohs(inet->sport) == port) {
 		if (hctx)
-			len += sprintf(buf + len, " %d %d %d %u %u %u %d",
-			       hctx->s, hctx->rtt, hctx->p, hctx->x_calc,
-			       (unsigned)(hctx->x_recv >> 6),
-			       (unsigned)(hctx->x >> 6), hctx->t_ipi);
-
-		len += sprintf(buf + len, "\n");
-		kfifo_put(dccpw.fifo, buf, len);
-		wake_up(&dccpw.wait);
+			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u "
+			       "%llu %llu %d\n",
+			       NIPQUAD(inet->saddr), ntohs(inet->sport),
+			       NIPQUAD(inet->daddr), ntohs(inet->dport), size,
+			       hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
+			       hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc,
+			       hctx->ccid3hctx_x_recv >> 6,
+			       hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi);
+		else
+			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
+			       NIPQUAD(inet->saddr), ntohs(inet->sport),
+			       NIPQUAD(inet->daddr), ntohs(inet->dport), size);
 	}
 
 	jprobe_return();
+	return 0;
 }
 
 static struct jprobe dccp_send_probe = {
 	.kp	= {
-		.symbol_name = "dccp_write_xmit",
+		.symbol_name = "dccp_sendmsg",
 	},
-	.entry	= jdccp_write_xmit,
+	.entry	= jdccp_sendmsg,
 };
 
 static int dccpprobe_open(struct inode *inode, struct file *file)
 {
 	kfifo_reset(dccpw.fifo);
-	dccpw.start = ktime_get();
+	getnstimeofday(&dccpw.tstart);
 	return 0;
 }
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ecf3be961e1..d0bd3481976 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -67,9 +67,6 @@ void dccp_set_state(struct sock *sk, const int state)
 	case DCCP_OPEN:
 		if (oldstate != DCCP_OPEN)
 			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
-		/* Client retransmits all Confirm options until entering OPEN */
-		if (oldstate == DCCP_PARTOPEN)
-			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
 		break;
 
 	case DCCP_CLOSED:
@@ -178,25 +175,63 @@ EXPORT_SYMBOL_GPL(dccp_state_name);
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	dccp_minisock_init(&dp->dccps_minisock);
+
 	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
 	sk->sk_state		= DCCP_CLOSED;
 	sk->sk_write_space	= dccp_write_space;
 	icsk->icsk_sync_mss	= dccp_sync_mss;
-	dp->dccps_mss_cache	= TCP_MIN_RCVMSS;
+	dp->dccps_mss_cache	= 536;
 	dp->dccps_rate_last	= jiffies;
 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
-	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
+	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
 
 	dccp_init_xmit_timers(sk);
 
-	INIT_LIST_HEAD(&dp->dccps_featneg);
-	/* control socket doesn't need feat nego */
-	if (likely(ctl_sock_initialized))
-		return dccp_feat_init(sk);
+	/*
+	 * FIXME: We're hardcoding the CCID, and doing this at this point makes
+	 * the listening (master) sock get CCID control blocks, which is not
+	 * necessary, but for now, to not mess with the test userspace apps,
+	 * lets leave it here, later the real solution is to do this in a
+	 * setsockopt(CCIDs-I-want/accept). -acme
+	 */
+	if (likely(ctl_sock_initialized)) {
+		int rc = dccp_feat_init(dmsk);
+
+		if (rc)
+			return rc;
+
+		if (dmsk->dccpms_send_ack_vector) {
+			dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
+			if (dp->dccps_hc_rx_ackvec == NULL)
+				return -ENOMEM;
+		}
+		dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
+						      sk, GFP_KERNEL);
+		dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
+						      sk, GFP_KERNEL);
+		if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
+			     dp->dccps_hc_tx_ccid == NULL)) {
+			ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+			ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+			if (dmsk->dccpms_send_ack_vector) {
+				dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+				dp->dccps_hc_rx_ackvec = NULL;
+			}
+			dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+			return -ENOMEM;
+		}
+	} else {
+		/* control socket doesn't need feat nego */
+		INIT_LIST_HEAD(&dmsk->dccpms_pending);
+		INIT_LIST_HEAD(&dmsk->dccpms_conf);
+	}
+
 	return 0;
 }
 
@@ -205,6 +240,7 @@ EXPORT_SYMBOL_GPL(dccp_init_sock);
 void dccp_destroy_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	/*
 	 * DCCP doesn't use sk_write_queue, just sk_send_head
@@ -222,7 +258,7 @@ void dccp_destroy_sock(struct sock *sk)
 	kfree(dp->dccps_service_list);
 	dp->dccps_service_list = NULL;
 
-	if (dp->dccps_hc_rx_ackvec != NULL) {
+	if (dmsk->dccpms_send_ack_vector) {
 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
@@ -231,7 +267,7 @@ void dccp_destroy_sock(struct sock *sk)
 	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
 
 	/* clean up feature negotiation state */
-	dccp_feat_list_purge(&dp->dccps_featneg);
+	dccp_feat_clean(dmsk);
 }
 
 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
@@ -241,9 +277,6 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
-	/* do not start to listen if feature negotiation setup fails */
-	if (dccp_feat_finalise_settings(dp))
-		return -EPROTO;
 	return inet_csk_listen_start(sk, backlog);
 }
 
@@ -433,70 +466,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
 	return 0;
 }
 
-static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
+/* byte 1 is feature.  the rest is the preference list */
+static int dccp_setsockopt_change(struct sock *sk, int type,
+				  struct dccp_so_feat __user *optval)
 {
-	u8 *list, len;
-	int i, rc;
+	struct dccp_so_feat opt;
+	u8 *val;
+	int rc;
 
-	if (cscov < 0 || cscov > 15)
-		return -EINVAL;
+	if (copy_from_user(&opt, optval, sizeof(opt)))
+		return -EFAULT;
 	/*
-	 * Populate a list of permissible values, in the range cscov...15. This
-	 * is necessary since feature negotiation of single values only works if
-	 * both sides incidentally choose the same value. Since the list starts
-	 * lowest-value first, negotiation will pick the smallest shared value.
+	 * rfc4340: 6.1. Change Options
 	 */
-	if (cscov == 0)
-		return 0;
-	len = 16 - cscov;
-
-	list = kmalloc(len, GFP_KERNEL);
-	if (list == NULL)
-		return -ENOBUFS;
-
-	for (i = 0; i < len; i++)
-		list[i] = cscov++;
-
-	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
-
-	if (rc == 0) {
-		if (rx)
-			dccp_sk(sk)->dccps_pcrlen = cscov;
-		else
-			dccp_sk(sk)->dccps_pcslen = cscov;
-	}
-	kfree(list);
-	return rc;
-}
-
-static int dccp_setsockopt_ccid(struct sock *sk, int type,
-				char __user *optval, int optlen)
-{
-	u8 *val;
-	int rc = 0;
-
-	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+	if (opt.dccpsf_len < 1)
 		return -EINVAL;
 
-	val = kmalloc(optlen, GFP_KERNEL);
-	if (val == NULL)
+	val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
+	if (!val)
 		return -ENOMEM;
 
-	if (copy_from_user(val, optval, optlen)) {
-		kfree(val);
-		return -EFAULT;
+	if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
+		rc = -EFAULT;
+		goto out_free_val;
 	}
 
-	lock_sock(sk);
-	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
-		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
+	rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat,
+			      val, opt.dccpsf_len, GFP_KERNEL);
+	if (rc)
+		goto out_free_val;
 
-	if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
-		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
-	release_sock(sk);
+out:
+	return rc;
 
+out_free_val:
 	kfree(val);
-	return rc;
+	goto out;
 }
 
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -505,21 +510,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 	struct dccp_sock *dp = dccp_sk(sk);
 	int val, err = 0;
 
-	switch (optname) {
-	case DCCP_SOCKOPT_PACKET_SIZE:
-		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
-		return 0;
-	case DCCP_SOCKOPT_CHANGE_L:
-	case DCCP_SOCKOPT_CHANGE_R:
-		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
-		return 0;
-	case DCCP_SOCKOPT_CCID:
-	case DCCP_SOCKOPT_RX_CCID:
-	case DCCP_SOCKOPT_TX_CCID:
-		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
-	}
-
-	if (optlen < (int)sizeof(int))
+	if (optlen < sizeof(int))
 		return -EINVAL;
 
 	if (get_user(val, (int __user *)optval))
@@ -530,38 +521,53 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 
 	lock_sock(sk);
 	switch (optname) {
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+		err = 0;
+		break;
+	case DCCP_SOCKOPT_CHANGE_L:
+		if (optlen != sizeof(struct dccp_so_feat))
+			err = -EINVAL;
+		else
+			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
+						     (struct dccp_so_feat __user *)
+						     optval);
+		break;
+	case DCCP_SOCKOPT_CHANGE_R:
+		if (optlen != sizeof(struct dccp_so_feat))
+			err = -EINVAL;
+		else
+			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
+						     (struct dccp_so_feat __user *)
+						     optval);
+		break;
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		if (dp->dccps_role != DCCP_ROLE_SERVER)
 			err = -EOPNOTSUPP;
 		else
 			dp->dccps_server_timewait = (val != 0);
 		break;
-	case DCCP_SOCKOPT_SEND_CSCOV:
-		err = dccp_setsockopt_cscov(sk, val, false);
-		break;
-	case DCCP_SOCKOPT_RECV_CSCOV:
-		err = dccp_setsockopt_cscov(sk, val, true);
-		break;
-	case DCCP_SOCKOPT_QPOLICY_ID:
-		if (sk->sk_state != DCCP_CLOSED)
-			err = -EISCONN;
-		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+	case DCCP_SOCKOPT_SEND_CSCOV:	/* sender side, RFC 4340, sec. 9.2 */
+		if (val < 0 || val > 15)
 			err = -EINVAL;
 		else
-			dp->dccps_qpolicy = val;
+			dp->dccps_pcslen = val;
 		break;
-	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
-		if (val < 0)
+	case DCCP_SOCKOPT_RECV_CSCOV:	/* receiver side, RFC 4340 sec. 9.2.1 */
+		if (val < 0 || val > 15)
 			err = -EINVAL;
-		else
-			dp->dccps_tx_qlen = val;
+		else {
+			dp->dccps_pcrlen = val;
+			/* FIXME: add feature negotiation,
+			 * ChangeL(MinimumChecksumCoverage, val) */
+		}
 		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
 	}
-	release_sock(sk);
 
+	release_sock(sk);
 	return err;
 }
 
@@ -642,18 +648,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_GET_CUR_MPS:
 		val = dp->dccps_mss_cache;
 		break;
-	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
-		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
-	case DCCP_SOCKOPT_TX_CCID:
-		val = ccid_get_current_tx_ccid(dp);
-		if (val < 0)
-			return -ENOPROTOOPT;
-		break;
-	case DCCP_SOCKOPT_RX_CCID:
-		val = ccid_get_current_rx_ccid(dp);
-		if (val < 0)
-			return -ENOPROTOOPT;
-		break;
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		val = dp->dccps_server_timewait;
 		break;
@@ -663,12 +657,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_RECV_CSCOV:
 		val = dp->dccps_pcrlen;
 		break;
-	case DCCP_SOCKOPT_QPOLICY_ID:
-		val = dp->dccps_qpolicy;
-		break;
-	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
-		val = dp->dccps_tx_qlen;
-		break;
 	case 128 ... 191:
 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
 					     len, (u32 __user *)optval, optlen);
@@ -711,47 +699,6 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
 #endif
 
-static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
-{
-	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
-
-	/*
-	 * Assign an (opaque) qpolicy priority value to skb->priority.
-	 *
-	 * We are overloading this skb field for use with the qpolicy subystem.
-	 * The skb->priority is normally used for the SO_PRIORITY option, which
-	 * is initialised from sk_priority. Since the assignment of sk_priority
-	 * to skb->priority happens later (on layer 3), we overload this field
-	 * for use with queueing priorities as long as the skb is on layer 4.
-	 * The default priority value (if nothing is set) is 0.
-	 */
-	skb->priority = 0;
-
-	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
-
-		if (!CMSG_OK(msg, cmsg))
-			return -EINVAL;
-
-		if (cmsg->cmsg_level != SOL_DCCP)
-			continue;
-
-		if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
-		    !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
-			return -EINVAL;
-
-		switch (cmsg->cmsg_type) {
-		case DCCP_SCM_PRIORITY:
-			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
-				return -EINVAL;
-			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 size_t len)
 {
@@ -767,7 +714,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	lock_sock(sk);
 
-	if (dccp_qpolicy_full(sk)) {
+	if (sysctl_dccp_tx_qlen &&
+	    (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
 		rc = -EAGAIN;
 		goto out_release;
 	}
@@ -795,12 +743,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (rc != 0)
 		goto out_discard;
 
-	rc = dccp_msghdr_parse(msg, skb);
-	if (rc != 0)
-		goto out_discard;
-
-	dccp_qpolicy_push(sk, skb);
-	dccp_write_xmit(sk);
+	skb_queue_tail(&sk->sk_write_queue, skb);
+	dccp_write_xmit(sk,0);
 out_release:
 	release_sock(sk);
 	return rc ? : len;
@@ -1023,22 +967,9 @@ void dccp_close(struct sock *sk, long timeout)
 		/* Check zero linger _after_ checking for unread data. */
 		sk->sk_prot->disconnect(sk, 0);
 	} else if (sk->sk_state != DCCP_CLOSED) {
-		/*
-		 * Normal connection termination. May need to wait if there are
-		 * still packets in the TX queue that are delayed by the CCID.
-		 */
-		dccp_flush_write_queue(sk, &timeout);
 		dccp_terminate_connection(sk);
 	}
 
-	/*
-	 * Flush write queue. This may be necessary in several cases:
-	 * - we have been closed by the peer but still have application data;
-	 * - abortive termination (unread data or zero linger time),
-	 * - normal termination but queue could not be flushed within time limit
-	 */
-	__skb_queue_purge(&sk->sk_write_queue);
-
 	sk_stream_wait_close(sk, timeout);
 
 adjudge_to_death:
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
deleted file mode 100644
index 27383f88c75..00000000000
--- a/net/dccp/qpolicy.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- *  net/dccp/qpolicy.c
- *
- *  Policy-based packet dequeueing interface for DCCP.
- *
- *  Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License v2
- *  as published by the Free Software Foundation.
- */
-#include "dccp.h"
-
-/*
- *	Simple Dequeueing Policy:
- *	If tx_qlen is different from 0, enqueue up to tx_qlen elements.
- */
-static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
-{
-	skb_queue_tail(&sk->sk_write_queue, skb);
-}
-
-static bool qpolicy_simple_full(struct sock *sk)
-{
-	return dccp_sk(sk)->dccps_tx_qlen &&
-	       sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
-}
-
-static struct sk_buff *qpolicy_simple_top(struct sock *sk)
-{
-	return skb_peek(&sk->sk_write_queue);
-}
-
-/*
- *	Priority-based Dequeueing Policy:
- *	If tx_qlen is different from 0 and the queue has reached its upper bound
- *	of tx_qlen elements, replace older packets lowest-priority-first.
- */
-static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
-{
-	struct sk_buff *skb, *best = NULL;
-
-	skb_queue_walk(&sk->sk_write_queue, skb)
-		if (best == NULL || skb->priority > best->priority)
-			best = skb;
-	return best;
-}
-
-static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
-{
-	struct sk_buff *skb, *worst = NULL;
-
-	skb_queue_walk(&sk->sk_write_queue, skb)
-		if (worst == NULL || skb->priority < worst->priority)
-			worst = skb;
-	return worst;
-}
-
-static bool qpolicy_prio_full(struct sock *sk)
-{
-	if (qpolicy_simple_full(sk))
-		dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
-	return false;
-}
-
-/**
- * struct dccp_qpolicy_operations  -  TX Packet Dequeueing Interface
- * @push: add a new @skb to the write queue
- * @full: indicates that no more packets will be admitted
- * @top:  peeks at whatever the queueing policy defines as its `top'
- */
-static struct dccp_qpolicy_operations {
-	void		(*push)	(struct sock *sk, struct sk_buff *skb);
-	bool		(*full) (struct sock *sk);
-	struct sk_buff*	(*top)  (struct sock *sk);
-	__be32		params;
-
-} qpol_table[DCCPQ_POLICY_MAX] = {
-	[DCCPQ_POLICY_SIMPLE] = {
-		.push   = qpolicy_simple_push,
-		.full   = qpolicy_simple_full,
-		.top    = qpolicy_simple_top,
-		.params = 0,
-	},
-	[DCCPQ_POLICY_PRIO] = {
-		.push   = qpolicy_simple_push,
-		.full   = qpolicy_prio_full,
-		.top    = qpolicy_prio_best_skb,
-		.params = DCCP_SCM_PRIORITY,
-	},
-};
-
-/*
- *	Externally visible interface
- */
-void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
-{
-	qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
-}
-
-bool dccp_qpolicy_full(struct sock *sk)
-{
-	return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
-}
-
-void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
-{
-	if (skb != NULL) {
-		skb_unlink(skb, &sk->sk_write_queue);
-		kfree_skb(skb);
-	}
-}
-
-struct sk_buff *dccp_qpolicy_top(struct sock *sk)
-{
-	return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
-}
-
-struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
-{
-	struct sk_buff *skb = dccp_qpolicy_top(sk);
-
-	/* Clear any skb fields that we used internally */
-	skb->priority = 0;
-
-	if (skb)
-		skb_unlink(skb, &sk->sk_write_queue);
-	return skb;
-}
-
-bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
-{
-	/* check if exactly one bit is set */
-	if (!param || (param & (param - 1)))
-		return false;
-	return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
-}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index a5a1856234e..21295993fdb 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -18,72 +18,76 @@
 #error This file should not be compiled without CONFIG_SYSCTL defined
 #endif
 
-/* Boundary values */
-static int		zero     = 0,
-			u8_max   = 0xFF;
-static unsigned long	seqw_min = 32;
-
 static struct ctl_table dccp_default_table[] = {
 	{
 		.procname	= "seq_window",
-		.data		= &sysctl_dccp_sequence_window,
-		.maxlen		= sizeof(sysctl_dccp_sequence_window),
+		.data		= &sysctl_dccp_feat_sequence_window,
+		.maxlen		= sizeof(sysctl_dccp_feat_sequence_window),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &seqw_min,		/* RFC 4340, 7.5.2 */
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "rx_ccid",
-		.data		= &sysctl_dccp_rx_ccid,
-		.maxlen		= sizeof(sysctl_dccp_rx_ccid),
+		.data		= &sysctl_dccp_feat_rx_ccid,
+		.maxlen		= sizeof(sysctl_dccp_feat_rx_ccid),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &u8_max,		/* RFC 4340, 10. */
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "tx_ccid",
-		.data		= &sysctl_dccp_tx_ccid,
-		.maxlen		= sizeof(sysctl_dccp_tx_ccid),
+		.data		= &sysctl_dccp_feat_tx_ccid,
+		.maxlen		= sizeof(sysctl_dccp_feat_tx_ccid),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ack_ratio",
+		.data		= &sysctl_dccp_feat_ack_ratio,
+		.maxlen		= sizeof(sysctl_dccp_feat_ack_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "send_ackvec",
+		.data		= &sysctl_dccp_feat_send_ack_vector,
+		.maxlen		= sizeof(sysctl_dccp_feat_send_ack_vector),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "send_ndp",
+		.data		= &sysctl_dccp_feat_send_ndp_count,
+		.maxlen		= sizeof(sysctl_dccp_feat_send_ndp_count),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &u8_max,		/* RFC 4340, 10. */
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "request_retries",
 		.data		= &sysctl_dccp_request_retries,
 		.maxlen		= sizeof(sysctl_dccp_request_retries),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &u8_max,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "retries1",
 		.data		= &sysctl_dccp_retries1,
 		.maxlen		= sizeof(sysctl_dccp_retries1),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &u8_max,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "retries2",
 		.data		= &sysctl_dccp_retries2,
 		.maxlen		= sizeof(sysctl_dccp_retries2),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &u8_max,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "tx_qlen",
 		.data		= &sysctl_dccp_tx_qlen,
 		.maxlen		= sizeof(sysctl_dccp_tx_qlen),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "sync_ratelimit",
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 16359e29e7f..54b3c7e9e01 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -87,6 +87,17 @@ static void dccp_retransmit_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	/* retransmit timer is used for feature negotiation throughout
+	 * connection.  In this case, no packet is re-transmitted, but rather an
+	 * ack is generated and pending changes are placed into its options.
+	 */
+	if (sk->sk_send_head == NULL) {
+		dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
+		if (sk->sk_state == DCCP_OPEN)
+			dccp_send_ack(sk);
+		goto backoff;
+	}
+
 	/*
 	 * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
 	 * sent, no need to retransmit, this sock is dead.
@@ -115,6 +126,7 @@ static void dccp_retransmit_timer(struct sock *sk)
 		return;
 	}
 
+backoff:
 	icsk->icsk_backoff++;
 
 	icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
@@ -237,35 +249,32 @@ out:
 	sock_put(sk);
 }
 
-/**
- * dccp_write_xmitlet  -  Workhorse for CCID packet dequeueing interface
- * See the comments above %ccid_dequeueing_decision for supported modes.
- */
-static void dccp_write_xmitlet(unsigned long data)
+/* Transmit-delay timer: used by the CCIDs to delay actual send time */
+static void dccp_write_xmit_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
+	struct dccp_sock *dp = dccp_sk(sk);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk))
-		sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
+		sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
 	else
-		dccp_write_xmit(sk);
+		dccp_write_xmit(sk, 0);
 	bh_unlock_sock(sk);
+	sock_put(sk);
 }
 
-static void dccp_write_xmit_timer(unsigned long data)
+static void dccp_init_write_xmit_timer(struct sock *sk)
 {
-	dccp_write_xmitlet(data);
-	sock_put((struct sock *)data);
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+			(unsigned long)sk);
 }
 
 void dccp_init_xmit_timers(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-
-	tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
-	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
-							     (unsigned long)sk);
+	dccp_init_write_xmit_timer(sk);
 	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
 				  &dccp_keepalive_timer);
 }
@@ -281,7 +290,8 @@ u32 dccp_timestamp(void)
 {
 	s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
 
-	return div_u64(delta, DCCP_TIME_RESOLUTION);
+	do_div(delta, 10);
+	return delta;
 }
 EXPORT_SYMBOL_GPL(dccp_timestamp);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9da9f19ece8..f79a5160729 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -811,12 +811,25 @@ void tcp_update_metrics(struct sock *sk)
 	}
 }
 
+/* Numbers are taken from RFC3390.
+ *
+ * John Heffner states:
+ *
+ *	The RFC specifies a window of no more than 4380 bytes
+ *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
+ *	is a bit misleading because they use a clamp at 4380 bytes
+ *	rather than use a multiplier in the relevant range.
+ */
 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-	if (!cwnd)
-		cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
+	if (!cwnd) {
+		if (tp->mss_cache > 1460)
+			cwnd = 2;
+		else
+			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
+	}
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-- 
cgit v1.2.3


From 28faa979746b2352cd78a376bf9f52db953bda46 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 9 Sep 2008 16:08:51 -0700
Subject: ipsec: Make xfrm_larval_drop default to 1.

The previous default behavior is definitely the least user
friendly.  Hanging there forever just because the keying
daemon is wedged or the refreshing of the policy can't move
forward is anti-social to say the least.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_policy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 46914b79d85..638bb5ff99a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -34,7 +34,7 @@
 
 #include "xfrm_hash.h"
 
-int sysctl_xfrm_larval_drop __read_mostly;
+int sysctl_xfrm_larval_drop __read_mostly = 1;
 
 #ifdef CONFIG_XFRM_STATISTICS
 DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics) __read_mostly;
-- 
cgit v1.2.3


From abb81c4f3cb9b8d421f1e5474811ef1d461d341c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 9 Sep 2008 19:58:29 -0700
Subject: ipsec: Use RCU-like construct for saved state within a walk

Now that we save states within a walk we need synchronisation
so that the list the saved state is on doesn't disappear from
under us.

As it stands this is done by keeping the state on the list which
is bad because it gets in the way of the management of the state
life-cycle.

An alternative is to make our own pseudo-RCU system where we use
counters to indicate which state can't be freed immediately as
it may be referenced by an ongoing walk when that resumes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_state.c | 52 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 0a8f09c3144..aaafcee02fc 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -59,6 +59,11 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 static unsigned int xfrm_state_num;
 static unsigned int xfrm_state_genid;
 
+/* Counter indicating ongoing walk, protected by xfrm_state_lock. */
+static unsigned long xfrm_state_walk_ongoing;
+/* Counter indicating walk completion, protected by xfrm_cfg_mutex. */
+static unsigned long xfrm_state_walk_completed;
+
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
 
@@ -191,7 +196,8 @@ static DEFINE_RWLOCK(xfrm_state_afinfo_lock);
 static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
 
 static struct work_struct xfrm_state_gc_work;
-static HLIST_HEAD(xfrm_state_gc_list);
+static LIST_HEAD(xfrm_state_gc_leftovers);
+static LIST_HEAD(xfrm_state_gc_list);
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
 int __xfrm_state_delete(struct xfrm_state *x);
@@ -403,17 +409,22 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 
 static void xfrm_state_gc_task(struct work_struct *data)
 {
-	struct xfrm_state *x;
-	struct hlist_node *entry, *tmp;
-	struct hlist_head gc_list;
+	struct xfrm_state *x, *tmp;
+	unsigned long completed;
 
+	mutex_lock(&xfrm_cfg_mutex);
 	spin_lock_bh(&xfrm_state_gc_lock);
-	gc_list.first = xfrm_state_gc_list.first;
-	INIT_HLIST_HEAD(&xfrm_state_gc_list);
+	list_splice_tail_init(&xfrm_state_gc_list, &xfrm_state_gc_leftovers);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 
-	hlist_for_each_entry_safe(x, entry, tmp, &gc_list, bydst)
+	completed = xfrm_state_walk_completed;
+	mutex_unlock(&xfrm_cfg_mutex);
+
+	list_for_each_entry_safe(x, tmp, &xfrm_state_gc_leftovers, gclist) {
+		if ((long)(x->lastused - completed) > 0)
+			break;
 		xfrm_state_gc_destroy(x);
+	}
 
 	wake_up(&km_waitq);
 }
@@ -540,12 +551,8 @@ void __xfrm_state_destroy(struct xfrm_state *x)
 {
 	WARN_ON(x->km.state != XFRM_STATE_DEAD);
 
-	spin_lock_bh(&xfrm_state_lock);
-	list_del(&x->all);
-	spin_unlock_bh(&xfrm_state_lock);
-
 	spin_lock_bh(&xfrm_state_gc_lock);
-	hlist_add_head(&x->bydst, &xfrm_state_gc_list);
+	list_add_tail(&x->gclist, &xfrm_state_gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 	schedule_work(&xfrm_state_gc_work);
 }
@@ -558,6 +565,8 @@ int __xfrm_state_delete(struct xfrm_state *x)
 	if (x->km.state != XFRM_STATE_DEAD) {
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&xfrm_state_lock);
+		x->lastused = xfrm_state_walk_ongoing;
+		list_del_rcu(&x->all);
 		hlist_del(&x->bydst);
 		hlist_del(&x->bysrc);
 		if (x->id.spi)
@@ -1574,6 +1583,7 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
 			if (err) {
 				xfrm_state_hold(last);
 				walk->state = last;
+				xfrm_state_walk_ongoing++;
 				goto out;
 			}
 		}
@@ -1588,12 +1598,28 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
 		err = func(last, 0, data);
 out:
 	spin_unlock_bh(&xfrm_state_lock);
-	if (old != NULL)
+	if (old != NULL) {
 		xfrm_state_put(old);
+		xfrm_state_walk_completed++;
+		if (!list_empty(&xfrm_state_gc_leftovers))
+			schedule_work(&xfrm_state_gc_work);
+	}
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_walk);
 
+void xfrm_state_walk_done(struct xfrm_state_walk *walk)
+{
+	if (walk->state != NULL) {
+		xfrm_state_put(walk->state);
+		walk->state = NULL;
+		xfrm_state_walk_completed++;
+		if (!list_empty(&xfrm_state_gc_leftovers))
+			schedule_work(&xfrm_state_gc_work);
+	}
+}
+EXPORT_SYMBOL(xfrm_state_walk_done);
+
 
 void xfrm_replay_notify(struct xfrm_state *x, int event)
 {
-- 
cgit v1.2.3


From 08569908fffec3625e29eec7cf7577eaa512e719 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 9 Sep 2008 22:13:28 -0700
Subject: ipsec: Add missing list_del() in xfrm_state_gc_task().

Otherwise entries stay on the GC todo list forever, even after we free
them.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_state.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index aaafcee02fc..abbe2702c40 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -423,6 +423,7 @@ static void xfrm_state_gc_task(struct work_struct *data)
 	list_for_each_entry_safe(x, tmp, &xfrm_state_gc_leftovers, gclist) {
 		if ((long)(x->lastused - completed) > 0)
 			break;
+		list_del(&x->gclist);
 		xfrm_state_gc_destroy(x);
 	}
 
-- 
cgit v1.2.3


From 1e493d1946a0b26b79001c18d7312d536156ff5a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 10 Sep 2008 17:27:15 -0700
Subject: ipv6: On interface down/unregister, purge icmp routes too.

Johannes Berg reported that occaisionally, bringing an interface
down or unregistering it would hang for up to 30 seconds.  Using
debugging output he provided it became clear that ICMP6 routes
were the culprit.

The problem is that ICMP6 routes live in their own world totally
separate from normal ipv6 routes.  So there are all kinds of special
cases throughout the ipv6 code to handle this.

While we should really try to unify all of this stuff somehow,
for the time being let's fix this by purging the ICMP6 routes
that match the device in question during rt6_ifdown().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9af6115f0f5..776871ee228 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1003,6 +1003,25 @@ int icmp6_dst_gc(void)
 	return more;
 }
 
+static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
+			    void *arg)
+{
+	struct dst_entry *dst, **pprev;
+
+	spin_lock_bh(&icmp6_dst_lock);
+	pprev = &icmp6_dst_gc_list;
+	while ((dst = *pprev) != NULL) {
+		struct rt6_info *rt = (struct rt6_info *) dst;
+		if (func(rt, arg)) {
+			*pprev = dst->next;
+			dst_free(dst);
+		} else {
+			pprev = &dst->next;
+		}
+	}
+	spin_unlock_bh(&icmp6_dst_lock);
+}
+
 static int ip6_dst_gc(struct dst_ops *ops)
 {
 	unsigned long now = jiffies;
@@ -1930,6 +1949,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
 	};
 
 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
+	icmp6_clean_all(fib6_ifdown, &adn);
 }
 
 struct rt6_mtu_change_arg
-- 
cgit v1.2.3


From a40c24a13366e324bc0ff8c3bb107db89312c984 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 11 Sep 2008 04:51:14 -0700
Subject: net: Add SKB DMA mapping helper functions.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/Makefile      |  1 +
 net/core/skb_dma_map.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 net/core/skb_dma_map.c

(limited to 'net')

diff --git a/net/core/Makefile b/net/core/Makefile
index b1332f6d004..26a37cb3192 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -6,6 +6,7 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
 	 gen_stats.o gen_estimator.o net_namespace.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
+obj-$(CONFIG_HAS_DMA) += skb_dma_map.o
 
 obj-y		     += dev.o ethtool.o dev_mcast.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o
diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c
new file mode 100644
index 00000000000..1f49afcd8e8
--- /dev/null
+++ b/net/core/skb_dma_map.c
@@ -0,0 +1,66 @@
+/* skb_dma_map.c: DMA mapping helpers for socket buffers.
+ *
+ * Copyright (C) David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+#include <linux/skbuff.h>
+
+int skb_dma_map(struct device *dev, struct sk_buff *skb,
+		enum dma_data_direction dir)
+{
+	struct skb_shared_info *sp = skb_shinfo(skb);
+	dma_addr_t map;
+	int i;
+
+	map = dma_map_single(dev, skb->data,
+			     skb_headlen(skb), dir);
+	if (dma_mapping_error(dev, map))
+		goto out_err;
+
+	sp->dma_maps[0] = map;
+	for (i = 0; i < sp->nr_frags; i++) {
+		skb_frag_t *fp = &sp->frags[i];
+
+		map = dma_map_page(dev, fp->page, fp->page_offset,
+				   fp->size, dir);
+		if (dma_mapping_error(dev, map))
+			goto unwind;
+		sp->dma_maps[i + 1] = map;
+	}
+	sp->num_dma_maps = i + 1;
+
+	return 0;
+
+unwind:
+	while (i-- >= 0) {
+		skb_frag_t *fp = &sp->frags[i];
+
+		dma_unmap_page(dev, sp->dma_maps[i + 1],
+			       fp->size, dir);
+	}
+	dma_unmap_single(dev, sp->dma_maps[0],
+			 skb_headlen(skb), dir);
+out_err:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(skb_dma_map);
+
+void skb_dma_unmap(struct device *dev, struct sk_buff *skb,
+		   enum dma_data_direction dir)
+{
+	struct skb_shared_info *sp = skb_shinfo(skb);
+	int i;
+
+	dma_unmap_single(dev, sp->dma_maps[0],
+			 skb_headlen(skb), dir);
+	for (i = 0; i < sp->nr_frags; i++) {
+		skb_frag_t *fp = &sp->frags[i];
+
+		dma_unmap_page(dev, sp->dma_maps[i + 1],
+			       fp->size, dir);
+	}
+}
+EXPORT_SYMBOL(skb_dma_unmap);
-- 
cgit v1.2.3


From 00c5ae2fa0f8191a1b204e71f0ee11359e3b2c06 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Wed, 3 Sep 2008 11:26:42 +0800
Subject: mac80211: change MIMO_PS to SM_PS

This patch follows 11n spec naming more rigorously replacing MIMO_PS
with SM_PS (Spatial Multiplexing Power Save).

(Originally submitted as 4 patches, "mac80211: change MIMO_PS to SM_PS",
"iwlwifi: change MIMO_PS to SM_PS", "ath9k: change MIMO_PS to SM_PS",
and "iwlwifi: remove double definition of SM PS". -- JWL)

Signed-off-by: Ron Rindjunsky <ron.rindjunsky@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 396cfb2d0f4..7dc06319725 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1140,8 +1140,8 @@ u32 ieee80211_handle_ht(struct ieee80211_local *local, int enable_ht,
 	ht_conf.ht_supported = 1;
 
 	ht_conf.cap = req_ht_cap->cap & sband->ht_info.cap;
-	ht_conf.cap &= ~(IEEE80211_HT_CAP_MIMO_PS);
-	ht_conf.cap |= sband->ht_info.cap & IEEE80211_HT_CAP_MIMO_PS;
+	ht_conf.cap &= ~(IEEE80211_HT_CAP_SM_PS);
+	ht_conf.cap |= sband->ht_info.cap & IEEE80211_HT_CAP_SM_PS;
 	ht_bss_conf.primary_channel = req_bss_cap->primary_channel;
 	ht_bss_conf.bss_cap = req_bss_cap->bss_cap;
 	ht_bss_conf.bss_op_mode = req_bss_cap->bss_op_mode;
-- 
cgit v1.2.3


From 69e6c010fd5f5015d3cc64718fbe266face93770 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 11:05:08 +0200
Subject: mac80211: move some RCU locking into an if branch

The if itself doesn't need to be protected, so move in the RCU
locking to avoid doing anything at all when the condition isn't
true.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index df12e746b03..0abd5a4fe38 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2612,13 +2612,12 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				      mesh_peer_accepts_plinks(elems));
 	}
 
-	rcu_read_lock();
-
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems->supp_rates &&
 	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
-
 		supp_rates = ieee80211_sta_get_rates(local, elems, band);
 
+		rcu_read_lock();
+
 		sta = sta_info_get(local, mgmt->sa);
 		if (sta) {
 			u64 prev_rates;
@@ -2642,9 +2641,9 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 			ieee80211_ibss_add_sta(sdata, NULL, mgmt->bssid,
 					       mgmt->sa, supp_rates);
 		}
-	}
 
-	rcu_read_unlock();
+		rcu_read_unlock();
+	}
 
 	if (elems->ds_params && elems->ds_params_len == 1)
 		freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
-- 
cgit v1.2.3


From fe3fa827314b877486c515a001c3e6f604f6f16f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 11:05:09 +0200
Subject: mac80211: make conf_tx non-atomic

The conf_tx callback currently needs to be atomic, this requirement
is just because it can be called from scanning. This rearranges it
slightly to only update while not scanning (which is fine, we'll be
getting beacons when associated) and thus removes the atomic
requirement.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0abd5a4fe38..a03245255ed 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2872,15 +2872,18 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	    memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
 		return;
 
-	ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
-				 elems.wmm_param_len);
-
 	/* Do not send changes to driver if we are scanning. This removes
-	 * requirement that driver's bss_info_changed function needs to be
-	 * atomic. */
+	 * requirement that a driver's bss_info_changed/conf_tx functions
+	 * need to be atomic.
+	 * This is really ugly code, we should rewrite scanning and make
+	 * all this more understandable for humans.
+	 */
 	if (local->sta_sw_scanning || local->sta_hw_scanning)
 		return;
 
+	ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
+				 elems.wmm_param_len);
+
 	if (elems.erp_info && elems.erp_info_len >= 1)
 		changed |= ieee80211_handle_erp_ie(sdata, elems.erp_info[0]);
 	else {
-- 
cgit v1.2.3


From 5bda617576e58c7213aef5ab90383f303727b5b1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 11:05:10 +0200
Subject: mac80211: BSS info: check channel first

When we receive information about a BSS we check at some point
whether or not we think we're allowed to use the channel it is
on, but we do that fairly late. I don't think we should do it
that late, so do it earlier to avoid doing IBSS/mesh stuff on
that channel and then getting confused because it's disabled.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a03245255ed..ae97d7e9945 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2602,7 +2602,15 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	DECLARE_MAC_BUF(mac);
 	DECLARE_MAC_BUF(mac2);
 
-	beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
+	if (elems->ds_params && elems->ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return;
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
 	    elems->mesh_config && mesh_matches_local(elems, sdata)) {
@@ -2645,16 +2653,6 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		rcu_read_unlock();
 	}
 
-	if (elems->ds_params && elems->ds_params_len == 1)
-		freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
-	else
-		freq = rx_status->freq;
-
-	channel = ieee80211_get_channel(local->hw.wiphy, freq);
-
-	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
-		return;
-
 #ifdef CONFIG_MAC80211_MESH
 	if (elems->mesh_config)
 		bss = ieee80211_rx_mesh_bss_get(local, elems->mesh_id,
@@ -2723,6 +2721,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 
 	bss->band = band;
 
+	beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
+
 	bss->timestamp = beacon_timestamp;
 	bss->last_update = jiffies;
 	bss->signal = rx_status->signal;
-- 
cgit v1.2.3


From 9c80d3dc272ec5ce44a7564e5392f950ad38357a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 15:41:59 +0200
Subject: mac80211: fix action frame length checks

The action frame length checks are one too small, there's not just
an action code as the comment makes you believe, there's a category
code too, and the category code is required in each action frame
(hence part of IEEE80211_MIN_ACTION_SIZE).

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mesh_hwmp.c  | 4 ++++
 net/mac80211/mesh_plink.c | 4 ++++
 net/mac80211/mlme.c       | 5 +++--
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index eeb0ce2d5d3..59fd7fe377e 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -581,6 +581,10 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
 	size_t baselen;
 	u32 last_hop_metric;
 
+	/* need action_code */
+	if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+		return;
+
 	baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt;
 	ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable,
 			len - baselen, &elems);
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7714b0e6e4d..74983cfa729 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -421,6 +421,10 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 	DECLARE_MAC_BUF(mac);
 #endif
 
+	/* need action_code, aux */
+	if (len < IEEE80211_MIN_ACTION_SIZE + 3)
+		return;
+
 	if (is_multicast_ether_addr(mgmt->da)) {
 		mpl_dbg("Mesh plink: ignore frame from multicast address");
 		return;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ae97d7e9945..eb1832aa1fe 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -60,7 +60,7 @@
 
 #define ERP_INFO_USE_PROTECTION BIT(1)
 
-/* mgmt header + 1 byte action code */
+/* mgmt header + 1 byte category code */
 #define IEEE80211_MIN_ACTION_SIZE (24 + 1)
 
 #define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
@@ -2988,7 +2988,8 @@ static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 
-	if (len < IEEE80211_MIN_ACTION_SIZE)
+	/* all categories we currently handle have action_code */
+	if (len < IEEE80211_MIN_ACTION_SIZE + 1)
 		return;
 
 	switch (mgmt->u.action.category) {
-- 
cgit v1.2.3


From 5fd12d4da198647e834f93f163e20bfcdd33bad8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 16:31:48 +0200
Subject: mac80211: fix typo in action frame handling

This says chan_switch.action_code but really means
measurement.action_code, of course the actual offset in
the frame is the same, it's just harder to understand
this way.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index eb1832aa1fe..2341e5b3c24 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2996,7 +2996,7 @@ static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
 	case WLAN_CATEGORY_SPECTRUM_MGMT:
 		if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
 			break;
-		switch (mgmt->u.action.u.chan_switch.action_code) {
+		switch (mgmt->u.action.u.measurement.action_code) {
 		case WLAN_ACTION_SPCT_MSR_REQ:
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.measurement)))
-- 
cgit v1.2.3


From 37ffc8da803a1151e887f2a80f08f0c49d1dc1d5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 16:40:36 +0200
Subject: mac80211: move IE parsing to util file

Since IE parsing is required for the mlme and mesh code, it's
not a static function anyway, and it's much better to have it
in util rather than the overly large mlme.c

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 145 ----------------------------------------------------
 net/mac80211/util.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+), 145 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2341e5b3c24..b03f1f3ef2e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -91,151 +91,6 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 static void sta_rx_agg_session_timer_expired(unsigned long data);
 
 
-void ieee802_11_parse_elems(u8 *start, size_t len,
-			    struct ieee802_11_elems *elems)
-{
-	size_t left = len;
-	u8 *pos = start;
-
-	memset(elems, 0, sizeof(*elems));
-	elems->ie_start = start;
-	elems->total_len = len;
-
-	while (left >= 2) {
-		u8 id, elen;
-
-		id = *pos++;
-		elen = *pos++;
-		left -= 2;
-
-		if (elen > left)
-			return;
-
-		switch (id) {
-		case WLAN_EID_SSID:
-			elems->ssid = pos;
-			elems->ssid_len = elen;
-			break;
-		case WLAN_EID_SUPP_RATES:
-			elems->supp_rates = pos;
-			elems->supp_rates_len = elen;
-			break;
-		case WLAN_EID_FH_PARAMS:
-			elems->fh_params = pos;
-			elems->fh_params_len = elen;
-			break;
-		case WLAN_EID_DS_PARAMS:
-			elems->ds_params = pos;
-			elems->ds_params_len = elen;
-			break;
-		case WLAN_EID_CF_PARAMS:
-			elems->cf_params = pos;
-			elems->cf_params_len = elen;
-			break;
-		case WLAN_EID_TIM:
-			elems->tim = pos;
-			elems->tim_len = elen;
-			break;
-		case WLAN_EID_IBSS_PARAMS:
-			elems->ibss_params = pos;
-			elems->ibss_params_len = elen;
-			break;
-		case WLAN_EID_CHALLENGE:
-			elems->challenge = pos;
-			elems->challenge_len = elen;
-			break;
-		case WLAN_EID_WPA:
-			if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 &&
-			    pos[2] == 0xf2) {
-				/* Microsoft OUI (00:50:F2) */
-				if (pos[3] == 1) {
-					/* OUI Type 1 - WPA IE */
-					elems->wpa = pos;
-					elems->wpa_len = elen;
-				} else if (elen >= 5 && pos[3] == 2) {
-					if (pos[4] == 0) {
-						elems->wmm_info = pos;
-						elems->wmm_info_len = elen;
-					} else if (pos[4] == 1) {
-						elems->wmm_param = pos;
-						elems->wmm_param_len = elen;
-					}
-				}
-			}
-			break;
-		case WLAN_EID_RSN:
-			elems->rsn = pos;
-			elems->rsn_len = elen;
-			break;
-		case WLAN_EID_ERP_INFO:
-			elems->erp_info = pos;
-			elems->erp_info_len = elen;
-			break;
-		case WLAN_EID_EXT_SUPP_RATES:
-			elems->ext_supp_rates = pos;
-			elems->ext_supp_rates_len = elen;
-			break;
-		case WLAN_EID_HT_CAPABILITY:
-			elems->ht_cap_elem = pos;
-			elems->ht_cap_elem_len = elen;
-			break;
-		case WLAN_EID_HT_EXTRA_INFO:
-			elems->ht_info_elem = pos;
-			elems->ht_info_elem_len = elen;
-			break;
-		case WLAN_EID_MESH_ID:
-			elems->mesh_id = pos;
-			elems->mesh_id_len = elen;
-			break;
-		case WLAN_EID_MESH_CONFIG:
-			elems->mesh_config = pos;
-			elems->mesh_config_len = elen;
-			break;
-		case WLAN_EID_PEER_LINK:
-			elems->peer_link = pos;
-			elems->peer_link_len = elen;
-			break;
-		case WLAN_EID_PREQ:
-			elems->preq = pos;
-			elems->preq_len = elen;
-			break;
-		case WLAN_EID_PREP:
-			elems->prep = pos;
-			elems->prep_len = elen;
-			break;
-		case WLAN_EID_PERR:
-			elems->perr = pos;
-			elems->perr_len = elen;
-			break;
-		case WLAN_EID_CHANNEL_SWITCH:
-			elems->ch_switch_elem = pos;
-			elems->ch_switch_elem_len = elen;
-			break;
-		case WLAN_EID_QUIET:
-			if (!elems->quiet_elem) {
-				elems->quiet_elem = pos;
-				elems->quiet_elem_len = elen;
-			}
-			elems->num_of_quiet_elem++;
-			break;
-		case WLAN_EID_COUNTRY:
-			elems->country_elem = pos;
-			elems->country_elem_len = elen;
-			break;
-		case WLAN_EID_PWR_CONSTRAINT:
-			elems->pwr_constr_elem = pos;
-			elems->pwr_constr_elem_len = elen;
-			break;
-		default:
-			break;
-		}
-
-		left -= elen;
-		pos += elen;
-	}
-}
-
-
 static u8 * ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
 {
 	u8 *end, *pos;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f40c060341a..e19c74cada3 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -428,3 +428,147 @@ void ieee80211_iterate_active_interfaces_atomic(
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);
+
+void ieee802_11_parse_elems(u8 *start, size_t len,
+			    struct ieee802_11_elems *elems)
+{
+	size_t left = len;
+	u8 *pos = start;
+
+	memset(elems, 0, sizeof(*elems));
+	elems->ie_start = start;
+	elems->total_len = len;
+
+	while (left >= 2) {
+		u8 id, elen;
+
+		id = *pos++;
+		elen = *pos++;
+		left -= 2;
+
+		if (elen > left)
+			return;
+
+		switch (id) {
+		case WLAN_EID_SSID:
+			elems->ssid = pos;
+			elems->ssid_len = elen;
+			break;
+		case WLAN_EID_SUPP_RATES:
+			elems->supp_rates = pos;
+			elems->supp_rates_len = elen;
+			break;
+		case WLAN_EID_FH_PARAMS:
+			elems->fh_params = pos;
+			elems->fh_params_len = elen;
+			break;
+		case WLAN_EID_DS_PARAMS:
+			elems->ds_params = pos;
+			elems->ds_params_len = elen;
+			break;
+		case WLAN_EID_CF_PARAMS:
+			elems->cf_params = pos;
+			elems->cf_params_len = elen;
+			break;
+		case WLAN_EID_TIM:
+			elems->tim = pos;
+			elems->tim_len = elen;
+			break;
+		case WLAN_EID_IBSS_PARAMS:
+			elems->ibss_params = pos;
+			elems->ibss_params_len = elen;
+			break;
+		case WLAN_EID_CHALLENGE:
+			elems->challenge = pos;
+			elems->challenge_len = elen;
+			break;
+		case WLAN_EID_WPA:
+			if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 &&
+			    pos[2] == 0xf2) {
+				/* Microsoft OUI (00:50:F2) */
+				if (pos[3] == 1) {
+					/* OUI Type 1 - WPA IE */
+					elems->wpa = pos;
+					elems->wpa_len = elen;
+				} else if (elen >= 5 && pos[3] == 2) {
+					if (pos[4] == 0) {
+						elems->wmm_info = pos;
+						elems->wmm_info_len = elen;
+					} else if (pos[4] == 1) {
+						elems->wmm_param = pos;
+						elems->wmm_param_len = elen;
+					}
+				}
+			}
+			break;
+		case WLAN_EID_RSN:
+			elems->rsn = pos;
+			elems->rsn_len = elen;
+			break;
+		case WLAN_EID_ERP_INFO:
+			elems->erp_info = pos;
+			elems->erp_info_len = elen;
+			break;
+		case WLAN_EID_EXT_SUPP_RATES:
+			elems->ext_supp_rates = pos;
+			elems->ext_supp_rates_len = elen;
+			break;
+		case WLAN_EID_HT_CAPABILITY:
+			elems->ht_cap_elem = pos;
+			elems->ht_cap_elem_len = elen;
+			break;
+		case WLAN_EID_HT_EXTRA_INFO:
+			elems->ht_info_elem = pos;
+			elems->ht_info_elem_len = elen;
+			break;
+		case WLAN_EID_MESH_ID:
+			elems->mesh_id = pos;
+			elems->mesh_id_len = elen;
+			break;
+		case WLAN_EID_MESH_CONFIG:
+			elems->mesh_config = pos;
+			elems->mesh_config_len = elen;
+			break;
+		case WLAN_EID_PEER_LINK:
+			elems->peer_link = pos;
+			elems->peer_link_len = elen;
+			break;
+		case WLAN_EID_PREQ:
+			elems->preq = pos;
+			elems->preq_len = elen;
+			break;
+		case WLAN_EID_PREP:
+			elems->prep = pos;
+			elems->prep_len = elen;
+			break;
+		case WLAN_EID_PERR:
+			elems->perr = pos;
+			elems->perr_len = elen;
+			break;
+		case WLAN_EID_CHANNEL_SWITCH:
+			elems->ch_switch_elem = pos;
+			elems->ch_switch_elem_len = elen;
+			break;
+		case WLAN_EID_QUIET:
+			if (!elems->quiet_elem) {
+				elems->quiet_elem = pos;
+				elems->quiet_elem_len = elen;
+			}
+			elems->num_of_quiet_elem++;
+			break;
+		case WLAN_EID_COUNTRY:
+			elems->country_elem = pos;
+			elems->country_elem_len = elen;
+			break;
+		case WLAN_EID_PWR_CONSTRAINT:
+			elems->pwr_constr_elem = pos;
+			elems->pwr_constr_elem_len = elen;
+			break;
+		default:
+			break;
+		}
+
+		left -= elen;
+		pos += elen;
+	}
+}
-- 
cgit v1.2.3


From aa458d1737c3cc9a7c90ea9c5ef1ee6d663fba71 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Tue, 9 Sep 2008 00:32:12 +0300
Subject: mac80211: restructure disassoc/deauth flows

This patch restructure the flow of disassociation and deauthentication
flows to be consistent under all circumstances.
It ensures that BA session is treated down before deauthentication or disassociation,
adds the removal of the obsolete sta form station table and fixes a related bug (sta_info_destroy
without sta_info_unlink) in ieee80211_associated()
and reduce some code duplication

Signed-off-by: Ron Rindjunsky <ron.rindjunsky@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 85 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b03f1f3ef2e..f7a390ff967 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -415,8 +415,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 		memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
 		ieee80211_sta_send_associnfo(sdata, ifsta);
 	} else {
-		netif_carrier_off(sdata->dev);
-		ieee80211_sta_tear_down_BA_sessions(sdata, ifsta->bssid);
 		ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
 		changed |= ieee80211_reset_erp_info(sdata);
 
@@ -439,18 +437,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
 }
 
-static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
-				   struct ieee80211_if_sta *ifsta, int deauth)
-{
-	if (deauth) {
-		ifsta->direct_probe_tries = 0;
-		ifsta->auth_tries = 0;
-	}
-	ifsta->assoc_scan_tries = 0;
-	ifsta->assoc_tries = 0;
-	ieee80211_set_associated(sdata, ifsta, 0);
-}
-
 void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      int encrypt)
 {
@@ -844,6 +830,50 @@ static void ieee80211_send_disassoc(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
+static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_if_sta *ifsta, bool deauth,
+				   bool self_disconnected, u16 reason)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	rcu_read_lock();
+
+	sta = sta_info_get(local, ifsta->bssid);
+	if (!sta) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (deauth) {
+		ifsta->direct_probe_tries = 0;
+		ifsta->auth_tries = 0;
+	}
+	ifsta->assoc_scan_tries = 0;
+	ifsta->assoc_tries = 0;
+
+	netif_carrier_off(sdata->dev);
+
+	ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
+
+	if (self_disconnected) {
+		if (deauth)
+			ieee80211_send_deauth(sdata, ifsta, reason);
+		else
+			ieee80211_send_disassoc(sdata, ifsta, reason);
+	}
+
+	ieee80211_set_associated(sdata, ifsta, 0);
+
+	if (self_disconnected)
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
+
+	sta_info_unlink(&sta);
+
+	rcu_read_unlock();
+
+	sta_info_destroy(sta);
+}
 
 static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_if_sta *ifsta)
@@ -938,7 +968,6 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 				       "range\n",
 				       sdata->dev->name, print_mac(mac, ifsta->bssid));
 				disassoc = 1;
-				sta_info_unlink(&sta);
 			} else
 				ieee80211_send_probe_req(sdata, ifsta->bssid,
 							 local->scan_ssid,
@@ -958,16 +987,12 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 
 	rcu_read_unlock();
 
-	if (disassoc && sta)
-		sta_info_destroy(sta);
-
-	if (disassoc) {
-		ifsta->state = IEEE80211_STA_MLME_DISABLED;
-		ieee80211_set_associated(sdata, ifsta, 0);
-	} else {
+	if (disassoc)
+		ieee80211_set_disassoc(sdata, ifsta, true, true,
+					WLAN_REASON_PREV_AUTH_NOT_VALID);
+	else
 		mod_timer(&ifsta->timer, jiffies +
 				      IEEE80211_MONITORING_INTERVAL);
-	}
 }
 
 
@@ -1832,7 +1857,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
 
-	ieee80211_set_disassoc(sdata, ifsta, 1);
+	ieee80211_set_disassoc(sdata, ifsta, true, false, 0);
 	ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED;
 }
 
@@ -1862,7 +1887,7 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
 				      IEEE80211_RETRY_AUTH_INTERVAL);
 	}
 
-	ieee80211_set_disassoc(sdata, ifsta, 0);
+	ieee80211_set_disassoc(sdata, ifsta, false, false, 0);
 }
 
 
@@ -3200,8 +3225,8 @@ void ieee80211_sta_work(struct work_struct *work)
 		printk(KERN_DEBUG "%s: privacy configuration mismatch and "
 		       "mixed-cell disabled - disassociate\n", sdata->dev->name);
 
-		ieee80211_send_disassoc(sdata, ifsta, WLAN_REASON_UNSPECIFIED);
-		ieee80211_set_disassoc(sdata, ifsta, 0);
+		ieee80211_set_disassoc(sdata, ifsta, false, true,
+					WLAN_REASON_UNSPECIFIED);
 	}
 }
 
@@ -4236,8 +4261,7 @@ int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason
 	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
 		return -EINVAL;
 
-	ieee80211_send_deauth(sdata, ifsta, reason);
-	ieee80211_set_disassoc(sdata, ifsta, 1);
+	ieee80211_set_disassoc(sdata, ifsta, true, true, reason);
 	return 0;
 }
 
@@ -4255,8 +4279,7 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
 	if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED))
 		return -1;
 
-	ieee80211_send_disassoc(sdata, ifsta, reason);
-	ieee80211_set_disassoc(sdata, ifsta, 0);
+	ieee80211_set_disassoc(sdata, ifsta, false, true, reason);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3b7ee69d0caefbdb85a606a98bff841b8c63b97e Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Mon, 8 Sep 2008 17:33:38 +0200
Subject: mac80211: disassociate when moving to new BSS

This patch makes the MLME cleanly disassociate from the current BSS
when leaving it for a new one. This is not just nicer to the old AP
(we're leaving it, might as well tell it!) but also required for some
drivers that keep track of the station we're associated with, they'd
get confused because they'd think we are associated with two APs.

Signed-off-by: Ron Rindjunsky <ron.rindjunsky@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f7a390ff967..eababf320b8 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3271,9 +3271,14 @@ void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	if ((ifsta->flags & (IEEE80211_STA_BSSID_SET |
-				IEEE80211_STA_AUTO_BSSID_SEL)) &&
+			     IEEE80211_STA_AUTO_BSSID_SEL)) &&
 	    (ifsta->flags & (IEEE80211_STA_SSID_SET |
-				IEEE80211_STA_AUTO_SSID_SEL))) {
+			     IEEE80211_STA_AUTO_SSID_SEL))) {
+
+		if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED)
+			ieee80211_set_disassoc(sdata, ifsta, true, true,
+					       WLAN_REASON_DEAUTH_LEAVING);
+
 		set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
 		queue_work(local->hw.workqueue, &ifsta->work);
 	}
-- 
cgit v1.2.3


From f5e5bf258b399f74b606e532ae0a2599522fd7bf Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Mon, 8 Sep 2008 17:33:39 +0200
Subject: mac80211: remove disassociation code from ieee80211_set_associated

This patch moves disassociation code from ieee80211_set_associated
to ieee80211_set_disassoc. To reduce code duplication, it introduces
the ieee80211_sta_send_apinfo function. Additionally, it fixes a lapse
where BSS_CHANGED_HT wasn't set when notifying the driver of changes
due to disassociation.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 102 ++++++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index eababf320b8..0e9bd840e7d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -352,6 +352,17 @@ int ieee80211_ht_addt_info_ie_to_ht_bss_info(
 	return 0;
 }
 
+static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_if_sta *ifsta)
+{
+	union iwreq_data wrqu;
+	memset(&wrqu, 0, sizeof(wrqu));
+	if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
+		memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
+	wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+	wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
+}
+
 static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_if_sta *ifsta)
 {
@@ -373,68 +384,53 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
 
 
 static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
-				     struct ieee80211_if_sta *ifsta,
-				     bool assoc)
+				     struct ieee80211_if_sta *ifsta)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_conf *conf = &local_to_hw(local)->conf;
-	union iwreq_data wrqu;
 	u32 changed = BSS_CHANGED_ASSOC;
 
-	if (assoc) {
-		struct ieee80211_sta_bss *bss;
-
-		ifsta->flags |= IEEE80211_STA_ASSOCIATED;
+	struct ieee80211_sta_bss *bss;
 
-		if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
-			return;
+	ifsta->flags |= IEEE80211_STA_ASSOCIATED;
 
-		bss = ieee80211_rx_bss_get(local, ifsta->bssid,
-					   conf->channel->center_freq,
-					   ifsta->ssid, ifsta->ssid_len);
-		if (bss) {
-			/* set timing information */
-			sdata->bss_conf.beacon_int = bss->beacon_int;
-			sdata->bss_conf.timestamp = bss->timestamp;
-			sdata->bss_conf.dtim_period = bss->dtim_period;
+	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+		return;
 
-			changed |= ieee80211_handle_bss_capability(sdata, bss);
+	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+				   conf->channel->center_freq,
+				   ifsta->ssid, ifsta->ssid_len);
+	if (bss) {
+		/* set timing information */
+		sdata->bss_conf.beacon_int = bss->beacon_int;
+		sdata->bss_conf.timestamp = bss->timestamp;
+		sdata->bss_conf.dtim_period = bss->dtim_period;
 
-			ieee80211_rx_bss_put(local, bss);
-		}
+		changed |= ieee80211_handle_bss_capability(sdata, bss);
 
-		if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
-			changed |= BSS_CHANGED_HT;
-			sdata->bss_conf.assoc_ht = 1;
-			sdata->bss_conf.ht_conf = &conf->ht_conf;
-			sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf;
-		}
+		ieee80211_rx_bss_put(local, bss);
+	}
 
-		ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
-		memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
-		memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
-		ieee80211_sta_send_associnfo(sdata, ifsta);
-	} else {
-		ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
-		changed |= ieee80211_reset_erp_info(sdata);
+	if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
+		changed |= BSS_CHANGED_HT;
+		sdata->bss_conf.assoc_ht = 1;
+		sdata->bss_conf.ht_conf = &conf->ht_conf;
+		sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf;
+	}
 
-		sdata->bss_conf.assoc_ht = 0;
-		sdata->bss_conf.ht_conf = NULL;
-		sdata->bss_conf.ht_bss_conf = NULL;
+	ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
+	memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
+	ieee80211_sta_send_associnfo(sdata, ifsta);
 
-		memset(wrqu.ap_addr.sa_data, 0, ETH_ALEN);
-	}
 	ifsta->last_probe = jiffies;
-	ieee80211_led_assoc(local, assoc);
+	ieee80211_led_assoc(local, 1);
 
-	sdata->bss_conf.assoc = assoc;
+	sdata->bss_conf.assoc = 1;
 	ieee80211_bss_info_change_notify(sdata, changed);
 
-	if (assoc)
-		netif_carrier_on(sdata->dev);
+	netif_carrier_on(sdata->dev);
 
-	wrqu.ap_addr.sa_family = ARPHRD_ETHER;
-	wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
+	ieee80211_sta_send_apinfo(sdata, ifsta);
 }
 
 void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
@@ -836,6 +832,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
+	u32 changed = BSS_CHANGED_ASSOC;
 
 	rcu_read_lock();
 
@@ -863,7 +860,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 			ieee80211_send_disassoc(sdata, ifsta, reason);
 	}
 
-	ieee80211_set_associated(sdata, ifsta, 0);
+	ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
+	changed |= ieee80211_reset_erp_info(sdata);
+
+	if (sdata->bss_conf.assoc_ht)
+		changed |= BSS_CHANGED_HT;
+
+	sdata->bss_conf.assoc_ht = 0;
+	sdata->bss_conf.ht_conf = NULL;
+	sdata->bss_conf.ht_bss_conf = NULL;
+
+	ieee80211_led_assoc(local, 0);
+	sdata->bss_conf.assoc = 0;
+
+	ieee80211_sta_send_apinfo(sdata, ifsta);
 
 	if (self_disconnected)
 		ifsta->state = IEEE80211_STA_MLME_DISABLED;
@@ -2081,7 +2091,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	 * ieee80211_set_associated() will tell the driver */
 	bss_conf->aid = aid;
 	bss_conf->assoc_capability = capab_info;
-	ieee80211_set_associated(sdata, ifsta, 1);
+	ieee80211_set_associated(sdata, ifsta);
 
 	ieee80211_associated(sdata, ifsta);
 }
-- 
cgit v1.2.3


From 24e64622c3f3143c801850897ab0cea8f3c69445 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Mon, 8 Sep 2008 17:33:40 +0200
Subject: mac80211: stop queues before carrier off

During testing of the disassociation fixes, Tomas noticed that it
was possible to run into a situation where you'd suddenly get a
few "wlan0: dropped frame to <AP> (unauthorized port)" messages
and I found this to be due to the AP's sta_info having been
removed but netif_carrier_off not having removed/stopped traffic
yet. To avoid that, stop the queue for the interface (and avoid
bringing them up when another vif scans when they weren't up.)

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0e9bd840e7d..d889e2a89e4 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -428,6 +428,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	sdata->bss_conf.assoc = 1;
 	ieee80211_bss_info_change_notify(sdata, changed);
 
+	netif_tx_start_all_queues(sdata->dev);
 	netif_carrier_on(sdata->dev);
 
 	ieee80211_sta_send_apinfo(sdata, ifsta);
@@ -849,6 +850,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	ifsta->assoc_scan_tries = 0;
 	ifsta->assoc_tries = 0;
 
+	netif_tx_stop_all_queues(sdata->dev);
 	netif_carrier_off(sdata->dev);
 
 	ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
@@ -3268,6 +3270,7 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
 	ifsta->direct_probe_tries = 0;
 	ifsta->auth_tries = 0;
 	ifsta->assoc_tries = 0;
+	netif_tx_stop_all_queues(sdata->dev);
 	netif_carrier_off(sdata->dev);
 }
 
@@ -3744,13 +3747,15 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		/* Tell AP we're back */
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA &&
-		    sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED)
-			ieee80211_send_nullfunc(local, sdata, 0);
+		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+				ieee80211_send_nullfunc(local, sdata, 0);
+				netif_tx_wake_all_queues(sdata->dev);
+			}
+		} else
+			netif_tx_wake_all_queues(sdata->dev);
 
 		ieee80211_restart_sta_timer(sdata);
-
-		netif_wake_queue(sdata->dev);
 	}
 	rcu_read_unlock();
 
@@ -3908,10 +3913,13 @@ static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		netif_stop_queue(sdata->dev);
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA &&
-		    (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED))
-			ieee80211_send_nullfunc(local, sdata, 1);
+		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+				netif_tx_stop_all_queues(sdata->dev);
+				ieee80211_send_nullfunc(local, sdata, 1);
+			}
+		} else
+			netif_tx_stop_all_queues(sdata->dev);
 	}
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 60f8b39c9406752ea5e0d3bbf5df6f903d61cacf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:22 +0200
Subject: mac80211: reorder mlme code

This reorders the mlme code a bit so we don't need all the forward
function declarations. It also removes the ERP_INFO_USE_PROTECTION
define that is unused, but otherwise contains no real changes.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 1224 +++++++++++++++++++++++++--------------------------
 1 file changed, 597 insertions(+), 627 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d889e2a89e4..67c38235a3c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -26,9 +26,8 @@
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
 #include <net/iw_handler.h>
-#include <asm/types.h>
-
 #include <net/mac80211.h>
+
 #include "ieee80211_i.h"
 #include "rate.h"
 #include "led.h"
@@ -58,8 +57,6 @@
 #define IEEE80211_IBSS_MAX_STA_ENTRIES 128
 
 
-#define ERP_INFO_USE_PROTECTION BIT(1)
-
 /* mgmt header + 1 byte category code */
 #define IEEE80211_MIN_ACTION_SIZE (24 + 1)
 
@@ -74,24 +71,199 @@
 #define IEEE80211_MIN_AMPDU_BUF 0x8
 #define IEEE80211_MAX_AMPDU_BUF 0x40
 
-static void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
-				     u8 *ssid, size_t ssid_len);
+/* BSS handling */
 static struct ieee80211_sta_bss *
 ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
-		     u8 *ssid, u8 ssid_len);
+		     u8 *ssid, u8 ssid_len)
+{
+	struct ieee80211_sta_bss *bss;
+
+	spin_lock_bh(&local->sta_bss_lock);
+	bss = local->sta_bss_hash[STA_HASH(bssid)];
+	while (bss) {
+		if (!bss_mesh_cfg(bss) &&
+		    !memcmp(bss->bssid, bssid, ETH_ALEN) &&
+		    bss->freq == freq &&
+		    bss->ssid_len == ssid_len &&
+		    (ssid_len == 0 || !memcmp(bss->ssid, ssid, ssid_len))) {
+			atomic_inc(&bss->users);
+			break;
+		}
+		bss = bss->hnext;
+	}
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+
+/* Caller must hold local->sta_bss_lock */
+static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
+					struct ieee80211_sta_bss *bss)
+{
+	u8 hash_idx;
+
+	if (bss_mesh_cfg(bss))
+		hash_idx = mesh_id_hash(bss_mesh_id(bss),
+					bss_mesh_id_len(bss));
+	else
+		hash_idx = STA_HASH(bss->bssid);
+
+	bss->hnext = local->sta_bss_hash[hash_idx];
+	local->sta_bss_hash[hash_idx] = bss;
+}
+
+/* Caller must hold local->sta_bss_lock */
+static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
+					struct ieee80211_sta_bss *bss)
+{
+	struct ieee80211_sta_bss *b, *prev = NULL;
+	b = local->sta_bss_hash[STA_HASH(bss->bssid)];
+	while (b) {
+		if (b == bss) {
+			if (!prev)
+				local->sta_bss_hash[STA_HASH(bss->bssid)] =
+					bss->hnext;
+			else
+				prev->hnext = bss->hnext;
+			break;
+		}
+		prev = b;
+		b = b->hnext;
+	}
+}
+
+static struct ieee80211_sta_bss *
+ieee80211_rx_bss_add(struct ieee80211_sub_if_data *sdata, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_sta_bss *bss;
+
+	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
+	if (!bss)
+		return NULL;
+	atomic_inc(&bss->users);
+	atomic_inc(&bss->users);
+	memcpy(bss->bssid, bssid, ETH_ALEN);
+	bss->freq = freq;
+	if (ssid && ssid_len <= IEEE80211_MAX_SSID_LEN) {
+		memcpy(bss->ssid, ssid, ssid_len);
+		bss->ssid_len = ssid_len;
+	}
+
+	spin_lock_bh(&local->sta_bss_lock);
+	/* TODO: order by RSSI? */
+	list_add_tail(&bss->list, &local->sta_bss_list);
+	__ieee80211_rx_bss_hash_add(local, bss);
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+
+#ifdef CONFIG_MAC80211_MESH
+static struct ieee80211_sta_bss *
+ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
+			  u8 *mesh_cfg, int freq)
+{
+	struct ieee80211_sta_bss *bss;
+
+	spin_lock_bh(&local->sta_bss_lock);
+	bss = local->sta_bss_hash[mesh_id_hash(mesh_id, mesh_id_len)];
+	while (bss) {
+		if (bss_mesh_cfg(bss) &&
+		    !memcmp(bss_mesh_cfg(bss), mesh_cfg, MESH_CFG_CMP_LEN) &&
+		    bss->freq == freq &&
+		    mesh_id_len == bss->mesh_id_len &&
+		    (mesh_id_len == 0 || !memcmp(bss->mesh_id, mesh_id,
+						 mesh_id_len))) {
+			atomic_inc(&bss->users);
+			break;
+		}
+		bss = bss->hnext;
+	}
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+
+static struct ieee80211_sta_bss *
+ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
+			  u8 *mesh_cfg, int mesh_config_len, int freq)
+{
+	struct ieee80211_sta_bss *bss;
+
+	if (mesh_config_len != MESH_CFG_LEN)
+		return NULL;
+
+	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
+	if (!bss)
+		return NULL;
+
+	bss->mesh_cfg = kmalloc(MESH_CFG_CMP_LEN, GFP_ATOMIC);
+	if (!bss->mesh_cfg) {
+		kfree(bss);
+		return NULL;
+	}
+
+	if (mesh_id_len && mesh_id_len <= IEEE80211_MAX_MESH_ID_LEN) {
+		bss->mesh_id = kmalloc(mesh_id_len, GFP_ATOMIC);
+		if (!bss->mesh_id) {
+			kfree(bss->mesh_cfg);
+			kfree(bss);
+			return NULL;
+		}
+		memcpy(bss->mesh_id, mesh_id, mesh_id_len);
+	}
+
+	atomic_inc(&bss->users);
+	atomic_inc(&bss->users);
+	memcpy(bss->mesh_cfg, mesh_cfg, MESH_CFG_CMP_LEN);
+	bss->mesh_id_len = mesh_id_len;
+	bss->freq = freq;
+	spin_lock_bh(&local->sta_bss_lock);
+	/* TODO: order by RSSI? */
+	list_add_tail(&bss->list, &local->sta_bss_list);
+	__ieee80211_rx_bss_hash_add(local, bss);
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+#endif
+
+static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
+{
+	kfree(bss->ies);
+	kfree(bss_mesh_id(bss));
+	kfree(bss_mesh_cfg(bss));
+	kfree(bss);
+}
+
 static void ieee80211_rx_bss_put(struct ieee80211_local *local,
-				 struct ieee80211_sta_bss *bss);
-static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
-				   struct ieee80211_if_sta *ifsta);
-static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata);
-static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *sdata,
-				    u8 *ssid, size_t ssid_len);
-static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
-				     struct ieee80211_if_sta *ifsta);
-static void sta_rx_agg_session_timer_expired(unsigned long data);
+				 struct ieee80211_sta_bss *bss)
+{
+	local_bh_disable();
+	if (!atomic_dec_and_lock(&bss->users, &local->sta_bss_lock)) {
+		local_bh_enable();
+		return;
+	}
+
+	__ieee80211_rx_bss_hash_del(local, bss);
+	list_del(&bss->list);
+	spin_unlock_bh(&local->sta_bss_lock);
+	ieee80211_rx_bss_free(bss);
+}
 
+void ieee80211_rx_bss_list_init(struct ieee80211_local *local)
+{
+	spin_lock_init(&local->sta_bss_lock);
+	INIT_LIST_HEAD(&local->sta_bss_list);
+}
+
+void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local)
+{
+	struct ieee80211_sta_bss *bss, *tmp;
 
-static u8 * ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
+	list_for_each_entry_safe(bss, tmp, &local->sta_bss_list, list)
+		ieee80211_rx_bss_put(local, bss);
+}
+
+static u8 *ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
 {
 	u8 *end, *pos;
 
@@ -111,13 +283,125 @@ static u8 * ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
 	return NULL;
 }
 
-
+/* utils */
 static int ecw2cw(int ecw)
 {
 	return (1 << ecw) - 1;
 }
 
+/* frame sending functions */
+void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		      int encrypt)
+{
+	skb->dev = sdata->local->mdev;
+	skb_set_mac_header(skb, 0);
+	skb_set_network_header(skb, 0);
+	skb_set_transport_header(skb, 0);
 
+	skb->iif = sdata->dev->ifindex;
+	skb->do_not_encrypt = !encrypt;
+
+	dev_queue_xmit(skb);
+}
+
+static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_if_sta *ifsta,
+				int transaction, u8 *extra, size_t extra_len,
+				int encrypt)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(*mgmt) + 6 + extra_len);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
+		       "frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
+	memset(mgmt, 0, 24 + 6);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_AUTH);
+	if (encrypt)
+		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
+	mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
+	ifsta->auth_transaction = transaction + 1;
+	mgmt->u.auth.status_code = cpu_to_le16(0);
+	if (extra)
+		memcpy(skb_put(skb, extra_len), extra, extra_len);
+
+	ieee80211_sta_tx(sdata, skb, encrypt);
+}
+
+static void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+				     u8 *ssid, size_t ssid_len)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_supported_band *sband;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u8 *pos, *supp_rates, *esupp_rates = NULL;
+	int i;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
+		       "request\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_PROBE_REQ);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	if (dst) {
+		memcpy(mgmt->da, dst, ETH_ALEN);
+		memcpy(mgmt->bssid, dst, ETH_ALEN);
+	} else {
+		memset(mgmt->da, 0xff, ETH_ALEN);
+		memset(mgmt->bssid, 0xff, ETH_ALEN);
+	}
+	pos = skb_put(skb, 2 + ssid_len);
+	*pos++ = WLAN_EID_SSID;
+	*pos++ = ssid_len;
+	memcpy(pos, ssid, ssid_len);
+
+	supp_rates = skb_put(skb, 2);
+	supp_rates[0] = WLAN_EID_SUPP_RATES;
+	supp_rates[1] = 0;
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	for (i = 0; i < sband->n_bitrates; i++) {
+		struct ieee80211_rate *rate = &sband->bitrates[i];
+		if (esupp_rates) {
+			pos = skb_put(skb, 1);
+			esupp_rates[1]++;
+		} else if (supp_rates[1] == 8) {
+			esupp_rates = skb_put(skb, 3);
+			esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
+			esupp_rates[1] = 1;
+			pos = &esupp_rates[2];
+		} else {
+			pos = skb_put(skb, 1);
+			supp_rates[1]++;
+		}
+		*pos = rate->bitrate / 5;
+	}
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+/* MLME */
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_sta_bss *bss,
 					 int ibss)
@@ -434,58 +718,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_send_apinfo(sdata, ifsta);
 }
 
-void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-		      int encrypt)
-{
-	skb->dev = sdata->local->mdev;
-	skb_set_mac_header(skb, 0);
-	skb_set_network_header(skb, 0);
-	skb_set_transport_header(skb, 0);
-
-	skb->iif = sdata->dev->ifindex;
-	skb->do_not_encrypt = !encrypt;
-
-	dev_queue_xmit(skb);
-}
-
-
-static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
-				struct ieee80211_if_sta *ifsta,
-				int transaction, u8 *extra, size_t extra_len,
-				int encrypt)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
-			    sizeof(*mgmt) + 6 + extra_len);
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
-		       "frame\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
-	memset(mgmt, 0, 24 + 6);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_AUTH);
-	if (encrypt)
-		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
-	mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
-	ifsta->auth_transaction = transaction + 1;
-	mgmt->u.auth.status_code = cpu_to_le16(0);
-	if (extra)
-		memcpy(skb_put(skb, extra_len), extra, extra_len);
-
-	ieee80211_sta_tx(sdata, skb, encrypt);
-}
-
 static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta)
 {
@@ -798,6 +1030,13 @@ static void ieee80211_send_deauth(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
+static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
+{
+	if (!sdata || !sdata->default_key ||
+	    sdata->default_key->conf.alg != ALG_WEP)
+		return 0;
+	return 1;
+}
 
 static void ieee80211_send_disassoc(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_if_sta *ifsta, u16 reason)
@@ -917,7 +1156,6 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
 	return 1;
 }
 
-
 static void ieee80211_associate(struct ieee80211_sub_if_data *sdata,
 				struct ieee80211_if_sta *ifsta)
 {
@@ -999,82 +1237,12 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 
 	rcu_read_unlock();
 
-	if (disassoc)
-		ieee80211_set_disassoc(sdata, ifsta, true, true,
-					WLAN_REASON_PREV_AUTH_NOT_VALID);
-	else
-		mod_timer(&ifsta->timer, jiffies +
-				      IEEE80211_MONITORING_INTERVAL);
-}
-
-
-static void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
-				     u8 *ssid, size_t ssid_len)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_supported_band *sband;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-	u8 *pos, *supp_rates, *esupp_rates = NULL;
-	int i;
-
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200);
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
-		       "request\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_PROBE_REQ);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (dst) {
-		memcpy(mgmt->da, dst, ETH_ALEN);
-		memcpy(mgmt->bssid, dst, ETH_ALEN);
-	} else {
-		memset(mgmt->da, 0xff, ETH_ALEN);
-		memset(mgmt->bssid, 0xff, ETH_ALEN);
-	}
-	pos = skb_put(skb, 2 + ssid_len);
-	*pos++ = WLAN_EID_SSID;
-	*pos++ = ssid_len;
-	memcpy(pos, ssid, ssid_len);
-
-	supp_rates = skb_put(skb, 2);
-	supp_rates[0] = WLAN_EID_SUPP_RATES;
-	supp_rates[1] = 0;
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
-	for (i = 0; i < sband->n_bitrates; i++) {
-		struct ieee80211_rate *rate = &sband->bitrates[i];
-		if (esupp_rates) {
-			pos = skb_put(skb, 1);
-			esupp_rates[1]++;
-		} else if (supp_rates[1] == 8) {
-			esupp_rates = skb_put(skb, 3);
-			esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
-			esupp_rates[1] = 1;
-			pos = &esupp_rates[2];
-		} else {
-			pos = skb_put(skb, 1);
-			supp_rates[1]++;
-		}
-		*pos = rate->bitrate / 5;
-	}
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
-
-static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
-{
-	if (!sdata || !sdata->default_key ||
-	    sdata->default_key->conf.alg != ALG_WEP)
-		return 0;
-	return 1;
+	if (disassoc)
+		ieee80211_set_disassoc(sdata, ifsta, true, true,
+					WLAN_REASON_PREV_AUTH_NOT_VALID);
+	else
+		mod_timer(&ifsta->timer, jiffies +
+				      IEEE80211_MONITORING_INTERVAL);
 }
 
 
@@ -1200,6 +1368,30 @@ void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
+/*
+ * After accepting the AddBA Request we activated a timer,
+ * resetting it after each frame that arrives from the originator.
+ * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed.
+ */
+static void sta_rx_agg_session_timer_expired(unsigned long data)
+{
+	/* not an elegant detour, but there is no choice as the timer passes
+	 * only one argument, and various sta_info are needed here, so init
+	 * flow in sta_info_create gives the TID as data, while the timer_to_id
+	 * array gives the sta through container_of */
+	u8 *ptid = (u8 *)data;
+	u8 *timer_to_id = ptid - *ptid;
+	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+					 timer_to_tid[0]);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
+#endif
+	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
+					 (u16)*ptid, WLAN_BACK_TIMER,
+					 WLAN_REASON_QSTA_TIMEOUT);
+}
+
 static void ieee80211_sta_process_addba_request(struct ieee80211_local *local,
 						struct ieee80211_mgmt *mgmt,
 						size_t len)
@@ -1646,30 +1838,6 @@ timer_expired_exit:
 	rcu_read_unlock();
 }
 
-/*
- * After accepting the AddBA Request we activated a timer,
- * resetting it after each frame that arrives from the originator.
- * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed.
- */
-static void sta_rx_agg_session_timer_expired(unsigned long data)
-{
-	/* not an elegant detour, but there is no choice as the timer passes
-	 * only one argument, and various sta_info are needed here, so init
-	 * flow in sta_info_create gives the TID as data, while the timer_to_id
-	 * array gives the sta through container_of */
-	u8 *ptid = (u8 *)data;
-	u8 *timer_to_id = ptid - *ptid;
-	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
-					 timer_to_tid[0]);
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
-#endif
-	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
-					 (u16)*ptid, WLAN_BACK_TIMER,
-					 WLAN_REASON_QSTA_TIMEOUT);
-}
-
 void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr)
 {
 	struct ieee80211_local *local = sdata->local;
@@ -1991,308 +2159,111 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 			rcu_read_unlock();
 			return;
 		}
-		bss = ieee80211_rx_bss_get(local, ifsta->bssid,
-					   local->hw.conf.channel->center_freq,
-					   ifsta->ssid, ifsta->ssid_len);
-		if (bss) {
-			sta->last_signal = bss->signal;
-			sta->last_qual = bss->qual;
-			sta->last_noise = bss->noise;
-			ieee80211_rx_bss_put(local, bss);
-		}
-
-		err = sta_info_insert(sta);
-		if (err) {
-			printk(KERN_DEBUG "%s: failed to insert STA entry for"
-			       " the AP (error %d)\n", sdata->dev->name, err);
-			rcu_read_unlock();
-			return;
-		}
-		/* update new sta with its last rx activity */
-		sta->last_rx = jiffies;
-	}
-
-	/*
-	 * FIXME: Do we really need to update the sta_info's information here?
-	 *	  We already know about the AP (we found it in our list) so it
-	 *	  should already be filled with the right info, no?
-	 *	  As is stands, all this is racy because typically we assume
-	 *	  the information that is filled in here (except flags) doesn't
-	 *	  change while a STA structure is alive. As such, it should move
-	 *	  to between the sta_info_alloc() and sta_info_insert() above.
-	 */
-
-	set_sta_flags(sta, WLAN_STA_AUTH | WLAN_STA_ASSOC | WLAN_STA_ASSOC_AP |
-			   WLAN_STA_AUTHORIZED);
-
-	rates = 0;
-	basic_rates = 0;
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
-	for (i = 0; i < elems.supp_rates_len; i++) {
-		int rate = (elems.supp_rates[i] & 0x7f) * 5;
-
-		if (rate > 110)
-			have_higher_than_11mbit = true;
-
-		for (j = 0; j < sband->n_bitrates; j++) {
-			if (sband->bitrates[j].bitrate == rate)
-				rates |= BIT(j);
-			if (elems.supp_rates[i] & 0x80)
-				basic_rates |= BIT(j);
-		}
-	}
-
-	for (i = 0; i < elems.ext_supp_rates_len; i++) {
-		int rate = (elems.ext_supp_rates[i] & 0x7f) * 5;
-
-		if (rate > 110)
-			have_higher_than_11mbit = true;
-
-		for (j = 0; j < sband->n_bitrates; j++) {
-			if (sband->bitrates[j].bitrate == rate)
-				rates |= BIT(j);
-			if (elems.ext_supp_rates[i] & 0x80)
-				basic_rates |= BIT(j);
-		}
-	}
-
-	sta->supp_rates[local->hw.conf.channel->band] = rates;
-	sdata->basic_rates = basic_rates;
-
-	/* cf. IEEE 802.11 9.2.12 */
-	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
-	    have_higher_than_11mbit)
-		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
-	else
-		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
-
-	if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
-	    (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
-		struct ieee80211_ht_bss_info bss_info;
-		ieee80211_ht_cap_ie_to_ht_info(
-				(struct ieee80211_ht_cap *)
-				elems.ht_cap_elem, &sta->ht_info);
-		ieee80211_ht_addt_info_ie_to_ht_bss_info(
-				(struct ieee80211_ht_addt_info *)
-				elems.ht_info_elem, &bss_info);
-		ieee80211_handle_ht(local, 1, &sta->ht_info, &bss_info);
-	}
-
-	rate_control_rate_init(sta, local);
-
-	if (elems.wmm_param) {
-		set_sta_flags(sta, WLAN_STA_WME);
-		rcu_read_unlock();
-		ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
-					 elems.wmm_param_len);
-	} else
-		rcu_read_unlock();
-
-	/* set AID and assoc capability,
-	 * ieee80211_set_associated() will tell the driver */
-	bss_conf->aid = aid;
-	bss_conf->assoc_capability = capab_info;
-	ieee80211_set_associated(sdata, ifsta);
-
-	ieee80211_associated(sdata, ifsta);
-}
-
-
-/* Caller must hold local->sta_bss_lock */
-static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
-					struct ieee80211_sta_bss *bss)
-{
-	u8 hash_idx;
-
-	if (bss_mesh_cfg(bss))
-		hash_idx = mesh_id_hash(bss_mesh_id(bss),
-					bss_mesh_id_len(bss));
-	else
-		hash_idx = STA_HASH(bss->bssid);
-
-	bss->hnext = local->sta_bss_hash[hash_idx];
-	local->sta_bss_hash[hash_idx] = bss;
-}
-
-
-/* Caller must hold local->sta_bss_lock */
-static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
-					struct ieee80211_sta_bss *bss)
-{
-	struct ieee80211_sta_bss *b, *prev = NULL;
-	b = local->sta_bss_hash[STA_HASH(bss->bssid)];
-	while (b) {
-		if (b == bss) {
-			if (!prev)
-				local->sta_bss_hash[STA_HASH(bss->bssid)] =
-					bss->hnext;
-			else
-				prev->hnext = bss->hnext;
-			break;
-		}
-		prev = b;
-		b = b->hnext;
-	}
-}
-
-
-static struct ieee80211_sta_bss *
-ieee80211_rx_bss_add(struct ieee80211_sub_if_data *sdata, u8 *bssid, int freq,
-		     u8 *ssid, u8 ssid_len)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss;
-
-	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
-	if (!bss)
-		return NULL;
-	atomic_inc(&bss->users);
-	atomic_inc(&bss->users);
-	memcpy(bss->bssid, bssid, ETH_ALEN);
-	bss->freq = freq;
-	if (ssid && ssid_len <= IEEE80211_MAX_SSID_LEN) {
-		memcpy(bss->ssid, ssid, ssid_len);
-		bss->ssid_len = ssid_len;
-	}
-
-	spin_lock_bh(&local->sta_bss_lock);
-	/* TODO: order by RSSI? */
-	list_add_tail(&bss->list, &local->sta_bss_list);
-	__ieee80211_rx_bss_hash_add(local, bss);
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-
-static struct ieee80211_sta_bss *
-ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
-		     u8 *ssid, u8 ssid_len)
-{
-	struct ieee80211_sta_bss *bss;
-
-	spin_lock_bh(&local->sta_bss_lock);
-	bss = local->sta_bss_hash[STA_HASH(bssid)];
-	while (bss) {
-		if (!bss_mesh_cfg(bss) &&
-		    !memcmp(bss->bssid, bssid, ETH_ALEN) &&
-		    bss->freq == freq &&
-		    bss->ssid_len == ssid_len &&
-		    (ssid_len == 0 || !memcmp(bss->ssid, ssid, ssid_len))) {
-			atomic_inc(&bss->users);
-			break;
-		}
-		bss = bss->hnext;
-	}
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-
-#ifdef CONFIG_MAC80211_MESH
-static struct ieee80211_sta_bss *
-ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
-			  u8 *mesh_cfg, int freq)
-{
-	struct ieee80211_sta_bss *bss;
+		bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+					   local->hw.conf.channel->center_freq,
+					   ifsta->ssid, ifsta->ssid_len);
+		if (bss) {
+			sta->last_signal = bss->signal;
+			sta->last_qual = bss->qual;
+			sta->last_noise = bss->noise;
+			ieee80211_rx_bss_put(local, bss);
+		}
 
-	spin_lock_bh(&local->sta_bss_lock);
-	bss = local->sta_bss_hash[mesh_id_hash(mesh_id, mesh_id_len)];
-	while (bss) {
-		if (bss_mesh_cfg(bss) &&
-		    !memcmp(bss_mesh_cfg(bss), mesh_cfg, MESH_CFG_CMP_LEN) &&
-		    bss->freq == freq &&
-		    mesh_id_len == bss->mesh_id_len &&
-		    (mesh_id_len == 0 || !memcmp(bss->mesh_id, mesh_id,
-						 mesh_id_len))) {
-			atomic_inc(&bss->users);
-			break;
+		err = sta_info_insert(sta);
+		if (err) {
+			printk(KERN_DEBUG "%s: failed to insert STA entry for"
+			       " the AP (error %d)\n", sdata->dev->name, err);
+			rcu_read_unlock();
+			return;
 		}
-		bss = bss->hnext;
+		/* update new sta with its last rx activity */
+		sta->last_rx = jiffies;
 	}
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
 
-static struct ieee80211_sta_bss *
-ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
-			  u8 *mesh_cfg, int mesh_config_len, int freq)
-{
-	struct ieee80211_sta_bss *bss;
+	/*
+	 * FIXME: Do we really need to update the sta_info's information here?
+	 *	  We already know about the AP (we found it in our list) so it
+	 *	  should already be filled with the right info, no?
+	 *	  As is stands, all this is racy because typically we assume
+	 *	  the information that is filled in here (except flags) doesn't
+	 *	  change while a STA structure is alive. As such, it should move
+	 *	  to between the sta_info_alloc() and sta_info_insert() above.
+	 */
 
-	if (mesh_config_len != MESH_CFG_LEN)
-		return NULL;
+	set_sta_flags(sta, WLAN_STA_AUTH | WLAN_STA_ASSOC | WLAN_STA_ASSOC_AP |
+			   WLAN_STA_AUTHORIZED);
 
-	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
-	if (!bss)
-		return NULL;
+	rates = 0;
+	basic_rates = 0;
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
-	bss->mesh_cfg = kmalloc(MESH_CFG_CMP_LEN, GFP_ATOMIC);
-	if (!bss->mesh_cfg) {
-		kfree(bss);
-		return NULL;
-	}
+	for (i = 0; i < elems.supp_rates_len; i++) {
+		int rate = (elems.supp_rates[i] & 0x7f) * 5;
 
-	if (mesh_id_len && mesh_id_len <= IEEE80211_MAX_MESH_ID_LEN) {
-		bss->mesh_id = kmalloc(mesh_id_len, GFP_ATOMIC);
-		if (!bss->mesh_id) {
-			kfree(bss->mesh_cfg);
-			kfree(bss);
-			return NULL;
+		if (rate > 110)
+			have_higher_than_11mbit = true;
+
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].bitrate == rate)
+				rates |= BIT(j);
+			if (elems.supp_rates[i] & 0x80)
+				basic_rates |= BIT(j);
 		}
-		memcpy(bss->mesh_id, mesh_id, mesh_id_len);
 	}
 
-	atomic_inc(&bss->users);
-	atomic_inc(&bss->users);
-	memcpy(bss->mesh_cfg, mesh_cfg, MESH_CFG_CMP_LEN);
-	bss->mesh_id_len = mesh_id_len;
-	bss->freq = freq;
-	spin_lock_bh(&local->sta_bss_lock);
-	/* TODO: order by RSSI? */
-	list_add_tail(&bss->list, &local->sta_bss_list);
-	__ieee80211_rx_bss_hash_add(local, bss);
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-#endif
-
-static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
-{
-	kfree(bss->ies);
-	kfree(bss_mesh_id(bss));
-	kfree(bss_mesh_cfg(bss));
-	kfree(bss);
-}
+	for (i = 0; i < elems.ext_supp_rates_len; i++) {
+		int rate = (elems.ext_supp_rates[i] & 0x7f) * 5;
 
+		if (rate > 110)
+			have_higher_than_11mbit = true;
 
-static void ieee80211_rx_bss_put(struct ieee80211_local *local,
-				 struct ieee80211_sta_bss *bss)
-{
-	local_bh_disable();
-	if (!atomic_dec_and_lock(&bss->users, &local->sta_bss_lock)) {
-		local_bh_enable();
-		return;
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].bitrate == rate)
+				rates |= BIT(j);
+			if (elems.ext_supp_rates[i] & 0x80)
+				basic_rates |= BIT(j);
+		}
 	}
 
-	__ieee80211_rx_bss_hash_del(local, bss);
-	list_del(&bss->list);
-	spin_unlock_bh(&local->sta_bss_lock);
-	ieee80211_rx_bss_free(bss);
-}
+	sta->supp_rates[local->hw.conf.channel->band] = rates;
+	sdata->basic_rates = basic_rates;
+
+	/* cf. IEEE 802.11 9.2.12 */
+	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+	    have_higher_than_11mbit)
+		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+	else
+		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
 
+	if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
+	    (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
+		struct ieee80211_ht_bss_info bss_info;
+		ieee80211_ht_cap_ie_to_ht_info(
+				(struct ieee80211_ht_cap *)
+				elems.ht_cap_elem, &sta->ht_info);
+		ieee80211_ht_addt_info_ie_to_ht_bss_info(
+				(struct ieee80211_ht_addt_info *)
+				elems.ht_info_elem, &bss_info);
+		ieee80211_handle_ht(local, 1, &sta->ht_info, &bss_info);
+	}
 
-void ieee80211_rx_bss_list_init(struct ieee80211_local *local)
-{
-	spin_lock_init(&local->sta_bss_lock);
-	INIT_LIST_HEAD(&local->sta_bss_list);
-}
+	rate_control_rate_init(sta, local);
 
+	if (elems.wmm_param) {
+		set_sta_flags(sta, WLAN_STA_WME);
+		rcu_read_unlock();
+		ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
+					 elems.wmm_param_len);
+	} else
+		rcu_read_unlock();
 
-void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local)
-{
-	struct ieee80211_sta_bss *bss, *tmp;
+	/* set AID and assoc capability,
+	 * ieee80211_set_associated() will tell the driver */
+	bss_conf->aid = aid;
+	bss_conf->assoc_capability = capab_info;
+	ieee80211_set_associated(sdata, ifsta);
 
-	list_for_each_entry_safe(bss, tmp, &local->sta_bss_list, list)
-		ieee80211_rx_bss_put(local, bss);
+	ieee80211_associated(sdata, ifsta);
 }
 
 
@@ -3145,104 +3116,15 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 
 void ieee80211_sta_timer(unsigned long data)
 {
-	struct ieee80211_sub_if_data *sdata =
-		(struct ieee80211_sub_if_data *) data;
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = sdata->local;
-
-	set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
-	queue_work(local->hw.workqueue, &ifsta->work);
-}
-
-void ieee80211_sta_work(struct work_struct *work)
-{
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data, u.sta.work);
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_sta *ifsta;
-	struct sk_buff *skb;
-
-	if (!netif_running(sdata->dev))
-		return;
-
-	if (local->sta_sw_scanning || local->sta_hw_scanning)
-		return;
-
-	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT))
-		return;
-	ifsta = &sdata->u.sta;
-
-	while ((skb = skb_dequeue(&ifsta->skb_queue)))
-		ieee80211_sta_rx_queued_mgmt(sdata, skb);
-
-#ifdef CONFIG_MAC80211_MESH
-	if (ifsta->preq_queue_len &&
-	    time_after(jiffies,
-		       ifsta->last_preq + msecs_to_jiffies(ifsta->mshcfg.dot11MeshHWMPpreqMinInterval)))
-		mesh_path_start_discovery(sdata);
-#endif
-
-	if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
-	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
-	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
-	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
-		if (ifsta->scan_ssid_len)
-			ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
-		else
-			ieee80211_sta_start_scan(sdata, NULL, 0);
-		return;
-	}
-
-	if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
-		if (ieee80211_sta_config_auth(sdata, ifsta))
-			return;
-		clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
-	} else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
-		return;
-
-	switch (ifsta->state) {
-	case IEEE80211_STA_MLME_DISABLED:
-		break;
-	case IEEE80211_STA_MLME_DIRECT_PROBE:
-		ieee80211_direct_probe(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_AUTHENTICATE:
-		ieee80211_authenticate(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_ASSOCIATE:
-		ieee80211_associate(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_ASSOCIATED:
-		ieee80211_associated(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_IBSS_SEARCH:
-		ieee80211_sta_find_ibss(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_IBSS_JOINED:
-		ieee80211_sta_merge_ibss(sdata, ifsta);
-		break;
-#ifdef CONFIG_MAC80211_MESH
-	case IEEE80211_STA_MLME_MESH_UP:
-		ieee80211_mesh_housekeeping(sdata, ifsta);
-		break;
-#endif
-	default:
-		WARN_ON(1);
-		break;
-	}
-
-	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
-		printk(KERN_DEBUG "%s: privacy configuration mismatch and "
-		       "mixed-cell disabled - disassociate\n", sdata->dev->name);
+	struct ieee80211_sub_if_data *sdata =
+		(struct ieee80211_sub_if_data *) data;
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_local *local = sdata->local;
 
-		ieee80211_set_disassoc(sdata, ifsta, false, true,
-					WLAN_REASON_UNSPECIFIED);
-	}
+	set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
+	queue_work(local->hw.workqueue, &ifsta->work);
 }
 
-
 static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
@@ -3327,85 +3209,6 @@ static int ieee80211_sta_match_ssid(struct ieee80211_if_sta *ifsta,
 	return 0;
 }
 
-static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
-				     struct ieee80211_if_sta *ifsta)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss, *selected = NULL;
-	int top_rssi = 0, freq;
-
-	spin_lock_bh(&local->sta_bss_lock);
-	freq = local->oper_channel->center_freq;
-	list_for_each_entry(bss, &local->sta_bss_list, list) {
-		if (!(bss->capability & WLAN_CAPABILITY_ESS))
-			continue;
-
-		if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
-			IEEE80211_STA_AUTO_BSSID_SEL |
-			IEEE80211_STA_AUTO_CHANNEL_SEL)) &&
-		    (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
-		     !!sdata->default_key))
-			continue;
-
-		if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) &&
-		    bss->freq != freq)
-			continue;
-
-		if (!(ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) &&
-		    memcmp(bss->bssid, ifsta->bssid, ETH_ALEN))
-			continue;
-
-		if (!(ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) &&
-		    !ieee80211_sta_match_ssid(ifsta, bss->ssid, bss->ssid_len))
-			continue;
-
-		if (!selected || top_rssi < bss->signal) {
-			selected = bss;
-			top_rssi = bss->signal;
-		}
-	}
-	if (selected)
-		atomic_inc(&selected->users);
-	spin_unlock_bh(&local->sta_bss_lock);
-
-	if (selected) {
-		ieee80211_set_freq(sdata, selected->freq);
-		if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
-			ieee80211_sta_set_ssid(sdata, selected->ssid,
-					       selected->ssid_len);
-		ieee80211_sta_set_bssid(sdata, selected->bssid);
-		ieee80211_sta_def_wmm_params(sdata, selected, 0);
-
-		/* Send out direct probe if no probe resp was received or
-		 * the one we have is outdated
-		 */
-		if (!selected->last_probe_resp ||
-		    time_after(jiffies, selected->last_probe_resp
-					+ IEEE80211_SCAN_RESULT_EXPIRE))
-			ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
-		else
-			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
-
-		ieee80211_rx_bss_put(local, selected);
-		ieee80211_sta_reset_auth(sdata, ifsta);
-		return 0;
-	} else {
-		if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
-			ifsta->assoc_scan_tries++;
-			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
-				ieee80211_sta_start_scan(sdata, NULL, 0);
-			else
-				ieee80211_sta_start_scan(sdata, ifsta->ssid,
-							 ifsta->ssid_len);
-			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
-			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
-		} else
-			ifsta->state = IEEE80211_STA_MLME_DISABLED;
-	}
-	return -1;
-}
-
-
 static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
@@ -4273,6 +4076,85 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 }
 
 
+static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_if_sta *ifsta)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_sta_bss *bss, *selected = NULL;
+	int top_rssi = 0, freq;
+
+	spin_lock_bh(&local->sta_bss_lock);
+	freq = local->oper_channel->center_freq;
+	list_for_each_entry(bss, &local->sta_bss_list, list) {
+		if (!(bss->capability & WLAN_CAPABILITY_ESS))
+			continue;
+
+		if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
+			IEEE80211_STA_AUTO_BSSID_SEL |
+			IEEE80211_STA_AUTO_CHANNEL_SEL)) &&
+		    (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
+		     !!sdata->default_key))
+			continue;
+
+		if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) &&
+		    bss->freq != freq)
+			continue;
+
+		if (!(ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) &&
+		    memcmp(bss->bssid, ifsta->bssid, ETH_ALEN))
+			continue;
+
+		if (!(ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) &&
+		    !ieee80211_sta_match_ssid(ifsta, bss->ssid, bss->ssid_len))
+			continue;
+
+		if (!selected || top_rssi < bss->signal) {
+			selected = bss;
+			top_rssi = bss->signal;
+		}
+	}
+	if (selected)
+		atomic_inc(&selected->users);
+	spin_unlock_bh(&local->sta_bss_lock);
+
+	if (selected) {
+		ieee80211_set_freq(sdata, selected->freq);
+		if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
+			ieee80211_sta_set_ssid(sdata, selected->ssid,
+					       selected->ssid_len);
+		ieee80211_sta_set_bssid(sdata, selected->bssid);
+		ieee80211_sta_def_wmm_params(sdata, selected, 0);
+
+		/* Send out direct probe if no probe resp was received or
+		 * the one we have is outdated
+		 */
+		if (!selected->last_probe_resp ||
+		    time_after(jiffies, selected->last_probe_resp
+					+ IEEE80211_SCAN_RESULT_EXPIRE))
+			ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+		else
+			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+
+		ieee80211_rx_bss_put(local, selected);
+		ieee80211_sta_reset_auth(sdata, ifsta);
+		return 0;
+	} else {
+		if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
+			ifsta->assoc_scan_tries++;
+			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
+				ieee80211_sta_start_scan(sdata, NULL, 0);
+			else
+				ieee80211_sta_start_scan(sdata, ifsta->ssid,
+							 ifsta->ssid_len);
+			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
+		} else
+			ifsta->state = IEEE80211_STA_MLME_DISABLED;
+	}
+	return -1;
+}
+
+
 int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason)
 {
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
@@ -4326,3 +4208,91 @@ void ieee80211_notify_mac(struct ieee80211_hw *hw,
 	}
 }
 EXPORT_SYMBOL(ieee80211_notify_mac);
+
+void ieee80211_sta_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data, u.sta.work);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_sta *ifsta;
+	struct sk_buff *skb;
+
+	if (!netif_running(sdata->dev))
+		return;
+
+	if (local->sta_sw_scanning || local->sta_hw_scanning)
+		return;
+
+	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
+		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+		    sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT))
+		return;
+	ifsta = &sdata->u.sta;
+
+	while ((skb = skb_dequeue(&ifsta->skb_queue)))
+		ieee80211_sta_rx_queued_mgmt(sdata, skb);
+
+#ifdef CONFIG_MAC80211_MESH
+	if (ifsta->preq_queue_len &&
+	    time_after(jiffies,
+		       ifsta->last_preq + msecs_to_jiffies(ifsta->mshcfg.dot11MeshHWMPpreqMinInterval)))
+		mesh_path_start_discovery(sdata);
+#endif
+
+	if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
+	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
+	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
+	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
+		if (ifsta->scan_ssid_len)
+			ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
+		else
+			ieee80211_sta_start_scan(sdata, NULL, 0);
+		return;
+	}
+
+	if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
+		if (ieee80211_sta_config_auth(sdata, ifsta))
+			return;
+		clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
+	} else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
+		return;
+
+	switch (ifsta->state) {
+	case IEEE80211_STA_MLME_DISABLED:
+		break;
+	case IEEE80211_STA_MLME_DIRECT_PROBE:
+		ieee80211_direct_probe(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_AUTHENTICATE:
+		ieee80211_authenticate(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_ASSOCIATE:
+		ieee80211_associate(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_ASSOCIATED:
+		ieee80211_associated(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_IBSS_SEARCH:
+		ieee80211_sta_find_ibss(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_IBSS_JOINED:
+		ieee80211_sta_merge_ibss(sdata, ifsta);
+		break;
+#ifdef CONFIG_MAC80211_MESH
+	case IEEE80211_STA_MLME_MESH_UP:
+		ieee80211_mesh_housekeeping(sdata, ifsta);
+		break;
+#endif
+	default:
+		WARN_ON(1);
+		break;
+	}
+
+	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
+		printk(KERN_DEBUG "%s: privacy configuration mismatch and "
+		       "mixed-cell disabled - disassociate\n", sdata->dev->name);
+
+		ieee80211_set_disassoc(sdata, ifsta, false, true,
+					WLAN_REASON_UNSPECIFIED);
+	}
+}
-- 
cgit v1.2.3


From 491775a50787b9fbb09b5735be3d111c65935f5c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:23 +0200
Subject: mac80211: use sdata pointer for scan interface

Since we now use sdata pointers most of the time, using a netdev
pointer here is somewhat artificial, use an sdata pointer instead.
Replace a netdev-prefix in a few messages by a wiphy-prefix.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  2 +-
 net/mac80211/main.c        |  2 +-
 net/mac80211/mlme.c        | 24 +++++++++++-------------
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c68d4df1119..a33bbd1ca2b 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -636,7 +636,7 @@ struct ieee80211_local {
 	enum { SCAN_SET_CHANNEL, SCAN_SEND_PROBE } scan_state;
 	unsigned long last_scan_completed;
 	struct delayed_work scan_work;
-	struct net_device *scan_dev;
+	struct ieee80211_sub_if_data *scan_sdata;
 	struct ieee80211_channel *oper_channel, *scan_channel;
 	u8 scan_ssid[IEEE80211_MAX_SSID_LEN];
 	size_t scan_ssid_len;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 7dc06319725..6df4a2e1509 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -551,7 +551,7 @@ static int ieee80211_stop(struct net_device *dev)
 		synchronize_rcu();
 		skb_queue_purge(&sdata->u.sta.skb_queue);
 
-		if (local->scan_dev == sdata->dev) {
+		if (local->scan_sdata == sdata) {
 			if (!local->ops->hw_scan) {
 				local->sta_sw_scanning = 0;
 				cancel_delayed_work(&local->scan_work);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 67c38235a3c..f60212b7500 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3507,19 +3507,18 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
 void ieee80211_scan_completed(struct ieee80211_hw *hw)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct net_device *dev = local->scan_dev;
 	struct ieee80211_sub_if_data *sdata;
 	union iwreq_data wrqu;
 
 	local->last_scan_completed = jiffies;
 	memset(&wrqu, 0, sizeof(wrqu));
-	wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL);
+	wireless_send_event(local->scan_sdata->dev, SIOCGIWSCAN, &wrqu, NULL);
 
 	if (local->sta_hw_scanning) {
 		local->sta_hw_scanning = 0;
 		if (ieee80211_hw_config(local))
 			printk(KERN_DEBUG "%s: failed to restore operational "
-			       "channel after scan\n", dev->name);
+			       "channel after scan\n", wiphy_name(local->hw.wiphy));
 		/* Restart STA timer for HW scan case */
 		rcu_read_lock();
 		list_for_each_entry_rcu(sdata, &local->interfaces, list)
@@ -3532,7 +3531,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	local->sta_sw_scanning = 0;
 	if (ieee80211_hw_config(local))
 		printk(KERN_DEBUG "%s: failed to restore operational "
-		       "channel after scan\n", dev->name);
+		       "channel after scan\n", wiphy_name(local->hw.wiphy));
 
 
 	netif_tx_lock_bh(local->mdev);
@@ -3562,8 +3561,8 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	}
 	rcu_read_unlock();
 
-done:
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ done:
+	sdata = local->scan_sdata;
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
 		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
@@ -3578,8 +3577,7 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 {
 	struct ieee80211_local *local =
 		container_of(work, struct ieee80211_local, scan_work.work);
-	struct net_device *dev = local->scan_dev;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_channel *chan;
 	int skip;
@@ -3627,7 +3625,7 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 			local->scan_channel = chan;
 			if (ieee80211_hw_config(local)) {
 				printk(KERN_DEBUG "%s: failed to set freq to "
-				       "%d MHz for scan\n", dev->name,
+				       "%d MHz for scan\n", wiphy_name(local->hw.wiphy),
 				       chan->center_freq);
 				skip = 1;
 			}
@@ -3697,7 +3695,7 @@ static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 	 */
 
 	if (local->sta_sw_scanning || local->sta_hw_scanning) {
-		if (local->scan_dev == scan_sdata->dev)
+		if (local->scan_sdata == scan_sdata)
 			return 0;
 		return -EBUSY;
 	}
@@ -3707,7 +3705,7 @@ static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 					     ssid, ssid_len);
 		if (!rc) {
 			local->sta_hw_scanning = 1;
-			local->scan_dev = scan_sdata->dev;
+			local->scan_sdata = scan_sdata;
 		}
 		return rc;
 	}
@@ -3734,7 +3732,7 @@ static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 	local->scan_state = SCAN_SET_CHANNEL;
 	local->scan_channel_idx = 0;
 	local->scan_band = IEEE80211_BAND_2GHZ;
-	local->scan_dev = scan_sdata->dev;
+	local->scan_sdata = scan_sdata;
 
 	netif_addr_lock_bh(local->mdev);
 	local->filter_flags |= FIF_BCN_PRBRESP_PROMISC;
@@ -3762,7 +3760,7 @@ int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t
 		return ieee80211_sta_start_scan(sdata, ssid, ssid_len);
 
 	if (local->sta_sw_scanning || local->sta_hw_scanning) {
-		if (local->scan_dev == sdata->dev)
+		if (local->scan_sdata == sdata)
 			return 0;
 		return -EBUSY;
 	}
-- 
cgit v1.2.3


From ee96d6ef82cc29421569b7cb7f7c7ee90168ec50 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:24 +0200
Subject: mac80211: remove useless non-NULL tests from scan results code

I'm surprised nobody complained about these before. What a waste.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f60212b7500..ba502ce132d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3822,6 +3822,7 @@ ieee80211_sta_scan_result(struct ieee80211_local *local,
 			  char *current_ev, char *end_buf)
 {
 	struct iw_event iwe;
+	char *buf;
 
 	if (time_after(jiffies,
 		       bss->last_update + IEEE80211_SCAN_RESULT_EXPIRE))
@@ -3896,7 +3897,7 @@ ieee80211_sta_scan_result(struct ieee80211_local *local,
 
 	ieee80211_sta_add_scan_ies(info, bss, &current_ev, end_buf);
 
-	if (bss && bss->supp_rates_len > 0) {
+	if (bss->supp_rates_len > 0) {
 		/* display all supported rates in readable format */
 		char *p = current_ev + iwe_stream_lcp_len(info);
 		int i;
@@ -3915,30 +3916,25 @@ ieee80211_sta_scan_result(struct ieee80211_local *local,
 		current_ev = p;
 	}
 
-	if (bss) {
-		char *buf;
-		buf = kmalloc(30, GFP_ATOMIC);
-		if (buf) {
-			memset(&iwe, 0, sizeof(iwe));
-			iwe.cmd = IWEVCUSTOM;
-			sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->timestamp));
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf,
-							  &iwe, buf);
-			memset(&iwe, 0, sizeof(iwe));
-			iwe.cmd = IWEVCUSTOM;
-			sprintf(buf, " Last beacon: %dms ago",
-				jiffies_to_msecs(jiffies - bss->last_update));
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf, &iwe, buf);
-			kfree(buf);
-		}
+	buf = kmalloc(30, GFP_ATOMIC);
+	if (buf) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->timestamp));
+		iwe.u.data.length = strlen(buf);
+		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+						  &iwe, buf);
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		sprintf(buf, " Last beacon: %dms ago",
+			jiffies_to_msecs(jiffies - bss->last_update));
+		iwe.u.data.length = strlen(buf);
+		current_ev = iwe_stream_add_point(info, current_ev,
+						  end_buf, &iwe, buf);
+		kfree(buf);
 	}
 
 	if (bss_mesh_cfg(bss)) {
-		char *buf;
 		u8 *cfg = bss_mesh_cfg(bss);
 		buf = kmalloc(50, GFP_ATOMIC);
 		if (buf) {
-- 
cgit v1.2.3


From 0a51b27e956bd9580296c48191b78175ed8b5971 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:25 +0200
Subject: mac80211: start moving scan code from mlme

Here's a first patch to move some code from mlme.c to a
new file called scan.c. The end result will hopefully be
a more manageable mlme.c.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/Makefile      |   1 +
 net/mac80211/ieee80211_i.h |  13 +-
 net/mac80211/mlme.c        | 559 ++-------------------------------------------
 net/mac80211/scan.c        | 553 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 582 insertions(+), 544 deletions(-)
 create mode 100644 net/mac80211/scan.c

(limited to 'net')

diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index a169b0201d6..376280a420a 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -7,6 +7,7 @@ mac80211-y := \
 	sta_info.o \
 	wep.o \
 	wpa.o \
+	scan.o \
 	mlme.o \
 	iface.o \
 	rate.o \
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a33bbd1ca2b..25dccd5cb2f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -53,6 +53,12 @@ struct ieee80211_local;
  * increased memory use (about 2 kB of RAM per entry). */
 #define IEEE80211_FRAGMENT_MAX 4
 
+/*
+ * Time after which we ignore scan results and no longer report/use
+ * them in any way.
+ */
+#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ)
+
 struct ieee80211_fragment_entry {
 	unsigned long first_frag_time;
 	unsigned int seq;
@@ -924,8 +930,13 @@ u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			    enum ieee80211_band band);
 void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		int encrypt);
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+			      u8 *ssid, size_t ssid_len);
 void ieee802_11_parse_elems(u8 *start, size_t len,
-				   struct ieee802_11_elems *elems);
+			    struct ieee802_11_elems *elems);
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
+int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
+			     u8 *ssid, size_t ssid_len);
 
 #ifdef CONFIG_MAC80211_MESH
 void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ba502ce132d..2caea9759b7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -46,10 +46,6 @@
 #define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
 #define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
 
-#define IEEE80211_PROBE_DELAY (HZ / 33)
-#define IEEE80211_CHANNEL_TIME (HZ / 33)
-#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 5)
-#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ)
 #define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
 #define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
 #define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ)
@@ -341,8 +337,8 @@ static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_tx(sdata, skb, encrypt);
 }
 
-static void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
-				     u8 *ssid, size_t ssid_len)
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+			      u8 *ssid, size_t ssid_len)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband;
@@ -3466,543 +3462,6 @@ int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
 }
 
 
-static void ieee80211_send_nullfunc(struct ieee80211_local *local,
-				    struct ieee80211_sub_if_data *sdata,
-				    int powersave)
-{
-	struct sk_buff *skb;
-	struct ieee80211_hdr *nullfunc;
-	__le16 fc;
-
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
-		       "frame\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
-	memset(nullfunc, 0, 24);
-	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
-			 IEEE80211_FCTL_TODS);
-	if (powersave)
-		fc |= cpu_to_le16(IEEE80211_FCTL_PM);
-	nullfunc->frame_control = fc;
-	memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN);
-	memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
-
-static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
-{
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    ieee80211_vif_is_mesh(&sdata->vif))
-		ieee80211_sta_timer((unsigned long)sdata);
-}
-
-void ieee80211_scan_completed(struct ieee80211_hw *hw)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct ieee80211_sub_if_data *sdata;
-	union iwreq_data wrqu;
-
-	local->last_scan_completed = jiffies;
-	memset(&wrqu, 0, sizeof(wrqu));
-	wireless_send_event(local->scan_sdata->dev, SIOCGIWSCAN, &wrqu, NULL);
-
-	if (local->sta_hw_scanning) {
-		local->sta_hw_scanning = 0;
-		if (ieee80211_hw_config(local))
-			printk(KERN_DEBUG "%s: failed to restore operational "
-			       "channel after scan\n", wiphy_name(local->hw.wiphy));
-		/* Restart STA timer for HW scan case */
-		rcu_read_lock();
-		list_for_each_entry_rcu(sdata, &local->interfaces, list)
-			ieee80211_restart_sta_timer(sdata);
-		rcu_read_unlock();
-
-		goto done;
-	}
-
-	local->sta_sw_scanning = 0;
-	if (ieee80211_hw_config(local))
-		printk(KERN_DEBUG "%s: failed to restore operational "
-		       "channel after scan\n", wiphy_name(local->hw.wiphy));
-
-
-	netif_tx_lock_bh(local->mdev);
-	netif_addr_lock(local->mdev);
-	local->filter_flags &= ~FIF_BCN_PRBRESP_PROMISC;
-	local->ops->configure_filter(local_to_hw(local),
-				     FIF_BCN_PRBRESP_PROMISC,
-				     &local->filter_flags,
-				     local->mdev->mc_count,
-				     local->mdev->mc_list);
-
-	netif_addr_unlock(local->mdev);
-	netif_tx_unlock_bh(local->mdev);
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		/* Tell AP we're back */
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
-			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
-				ieee80211_send_nullfunc(local, sdata, 0);
-				netif_tx_wake_all_queues(sdata->dev);
-			}
-		} else
-			netif_tx_wake_all_queues(sdata->dev);
-
-		ieee80211_restart_sta_timer(sdata);
-	}
-	rcu_read_unlock();
-
- done:
-	sdata = local->scan_sdata;
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
-		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
-		    !ieee80211_sta_active_ibss(sdata)))
-			ieee80211_sta_find_ibss(sdata, ifsta);
-	}
-}
-EXPORT_SYMBOL(ieee80211_scan_completed);
-
-void ieee80211_sta_scan_work(struct work_struct *work)
-{
-	struct ieee80211_local *local =
-		container_of(work, struct ieee80211_local, scan_work.work);
-	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
-	struct ieee80211_supported_band *sband;
-	struct ieee80211_channel *chan;
-	int skip;
-	unsigned long next_delay = 0;
-
-	if (!local->sta_sw_scanning)
-		return;
-
-	switch (local->scan_state) {
-	case SCAN_SET_CHANNEL:
-		/*
-		 * Get current scan band. scan_band may be IEEE80211_NUM_BANDS
-		 * after we successfully scanned the last channel of the last
-		 * band (and the last band is supported by the hw)
-		 */
-		if (local->scan_band < IEEE80211_NUM_BANDS)
-			sband = local->hw.wiphy->bands[local->scan_band];
-		else
-			sband = NULL;
-
-		/*
-		 * If we are at an unsupported band and have more bands
-		 * left to scan, advance to the next supported one.
-		 */
-		while (!sband && local->scan_band < IEEE80211_NUM_BANDS - 1) {
-			local->scan_band++;
-			sband = local->hw.wiphy->bands[local->scan_band];
-			local->scan_channel_idx = 0;
-		}
-
-		/* if no more bands/channels left, complete scan */
-		if (!sband || local->scan_channel_idx >= sband->n_channels) {
-			ieee80211_scan_completed(local_to_hw(local));
-			return;
-		}
-		skip = 0;
-		chan = &sband->channels[local->scan_channel_idx];
-
-		if (chan->flags & IEEE80211_CHAN_DISABLED ||
-		    (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
-		     chan->flags & IEEE80211_CHAN_NO_IBSS))
-			skip = 1;
-
-		if (!skip) {
-			local->scan_channel = chan;
-			if (ieee80211_hw_config(local)) {
-				printk(KERN_DEBUG "%s: failed to set freq to "
-				       "%d MHz for scan\n", wiphy_name(local->hw.wiphy),
-				       chan->center_freq);
-				skip = 1;
-			}
-		}
-
-		/* advance state machine to next channel/band */
-		local->scan_channel_idx++;
-		if (local->scan_channel_idx >= sband->n_channels) {
-			/*
-			 * scan_band may end up == IEEE80211_NUM_BANDS, but
-			 * we'll catch that case above and complete the scan
-			 * if that is the case.
-			 */
-			local->scan_band++;
-			local->scan_channel_idx = 0;
-		}
-
-		if (skip)
-			break;
-
-		next_delay = IEEE80211_PROBE_DELAY +
-			     usecs_to_jiffies(local->hw.channel_change_time);
-		local->scan_state = SCAN_SEND_PROBE;
-		break;
-	case SCAN_SEND_PROBE:
-		next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
-		local->scan_state = SCAN_SET_CHANNEL;
-
-		if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN)
-			break;
-		ieee80211_send_probe_req(sdata, NULL, local->scan_ssid,
-					 local->scan_ssid_len);
-		next_delay = IEEE80211_CHANNEL_TIME;
-		break;
-	}
-
-	if (local->sta_sw_scanning)
-		queue_delayed_work(local->hw.workqueue, &local->scan_work,
-				   next_delay);
-}
-
-
-static int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
-				    u8 *ssid, size_t ssid_len)
-{
-	struct ieee80211_local *local = scan_sdata->local;
-	struct ieee80211_sub_if_data *sdata;
-
-	if (ssid_len > IEEE80211_MAX_SSID_LEN)
-		return -EINVAL;
-
-	/* MLME-SCAN.request (page 118)  page 144 (11.1.3.1)
-	 * BSSType: INFRASTRUCTURE, INDEPENDENT, ANY_BSS
-	 * BSSID: MACAddress
-	 * SSID
-	 * ScanType: ACTIVE, PASSIVE
-	 * ProbeDelay: delay (in microseconds) to be used prior to transmitting
-	 *    a Probe frame during active scanning
-	 * ChannelList
-	 * MinChannelTime (>= ProbeDelay), in TU
-	 * MaxChannelTime: (>= MinChannelTime), in TU
-	 */
-
-	 /* MLME-SCAN.confirm
-	  * BSSDescriptionSet
-	  * ResultCode: SUCCESS, INVALID_PARAMETERS
-	 */
-
-	if (local->sta_sw_scanning || local->sta_hw_scanning) {
-		if (local->scan_sdata == scan_sdata)
-			return 0;
-		return -EBUSY;
-	}
-
-	if (local->ops->hw_scan) {
-		int rc = local->ops->hw_scan(local_to_hw(local),
-					     ssid, ssid_len);
-		if (!rc) {
-			local->sta_hw_scanning = 1;
-			local->scan_sdata = scan_sdata;
-		}
-		return rc;
-	}
-
-	local->sta_sw_scanning = 1;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
-			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
-				netif_tx_stop_all_queues(sdata->dev);
-				ieee80211_send_nullfunc(local, sdata, 1);
-			}
-		} else
-			netif_tx_stop_all_queues(sdata->dev);
-	}
-	rcu_read_unlock();
-
-	if (ssid) {
-		local->scan_ssid_len = ssid_len;
-		memcpy(local->scan_ssid, ssid, ssid_len);
-	} else
-		local->scan_ssid_len = 0;
-	local->scan_state = SCAN_SET_CHANNEL;
-	local->scan_channel_idx = 0;
-	local->scan_band = IEEE80211_BAND_2GHZ;
-	local->scan_sdata = scan_sdata;
-
-	netif_addr_lock_bh(local->mdev);
-	local->filter_flags |= FIF_BCN_PRBRESP_PROMISC;
-	local->ops->configure_filter(local_to_hw(local),
-				     FIF_BCN_PRBRESP_PROMISC,
-				     &local->filter_flags,
-				     local->mdev->mc_count,
-				     local->mdev->mc_list);
-	netif_addr_unlock_bh(local->mdev);
-
-	/* TODO: start scan as soon as all nullfunc frames are ACKed */
-	queue_delayed_work(local->hw.workqueue, &local->scan_work,
-			   IEEE80211_CHANNEL_TIME);
-
-	return 0;
-}
-
-
-int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len)
-{
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = sdata->local;
-
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
-		return ieee80211_sta_start_scan(sdata, ssid, ssid_len);
-
-	if (local->sta_sw_scanning || local->sta_hw_scanning) {
-		if (local->scan_sdata == sdata)
-			return 0;
-		return -EBUSY;
-	}
-
-	ifsta->scan_ssid_len = ssid_len;
-	if (ssid_len)
-		memcpy(ifsta->scan_ssid, ssid, ssid_len);
-	set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request);
-	queue_work(local->hw.workqueue, &ifsta->work);
-	return 0;
-}
-
-
-static void ieee80211_sta_add_scan_ies(struct iw_request_info *info,
-				       struct ieee80211_sta_bss *bss,
-				       char **current_ev, char *end_buf)
-{
-	u8 *pos, *end, *next;
-	struct iw_event iwe;
-
-	if (bss == NULL || bss->ies == NULL)
-		return;
-
-	/*
-	 * If needed, fragment the IEs buffer (at IE boundaries) into short
-	 * enough fragments to fit into IW_GENERIC_IE_MAX octet messages.
-	 */
-	pos = bss->ies;
-	end = pos + bss->ies_len;
-
-	while (end - pos > IW_GENERIC_IE_MAX) {
-		next = pos + 2 + pos[1];
-		while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX)
-			next = next + 2 + next[1];
-
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVGENIE;
-		iwe.u.data.length = next - pos;
-		*current_ev = iwe_stream_add_point(info, *current_ev,
-						   end_buf, &iwe, pos);
-
-		pos = next;
-	}
-
-	if (end > pos) {
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVGENIE;
-		iwe.u.data.length = end - pos;
-		*current_ev = iwe_stream_add_point(info, *current_ev,
-						   end_buf, &iwe, pos);
-	}
-}
-
-
-static char *
-ieee80211_sta_scan_result(struct ieee80211_local *local,
-			  struct iw_request_info *info,
-			  struct ieee80211_sta_bss *bss,
-			  char *current_ev, char *end_buf)
-{
-	struct iw_event iwe;
-	char *buf;
-
-	if (time_after(jiffies,
-		       bss->last_update + IEEE80211_SCAN_RESULT_EXPIRE))
-		return current_ev;
-
-	memset(&iwe, 0, sizeof(iwe));
-	iwe.cmd = SIOCGIWAP;
-	iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
-	memcpy(iwe.u.ap_addr.sa_data, bss->bssid, ETH_ALEN);
-	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
-					  IW_EV_ADDR_LEN);
-
-	memset(&iwe, 0, sizeof(iwe));
-	iwe.cmd = SIOCGIWESSID;
-	if (bss_mesh_cfg(bss)) {
-		iwe.u.data.length = bss_mesh_id_len(bss);
-		iwe.u.data.flags = 1;
-		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-						  &iwe, bss_mesh_id(bss));
-	} else {
-		iwe.u.data.length = bss->ssid_len;
-		iwe.u.data.flags = 1;
-		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-						  &iwe, bss->ssid);
-	}
-
-	if (bss->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)
-	    || bss_mesh_cfg(bss)) {
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = SIOCGIWMODE;
-		if (bss_mesh_cfg(bss))
-			iwe.u.mode = IW_MODE_MESH;
-		else if (bss->capability & WLAN_CAPABILITY_ESS)
-			iwe.u.mode = IW_MODE_MASTER;
-		else
-			iwe.u.mode = IW_MODE_ADHOC;
-		current_ev = iwe_stream_add_event(info, current_ev, end_buf,
-						  &iwe, IW_EV_UINT_LEN);
-	}
-
-	memset(&iwe, 0, sizeof(iwe));
-	iwe.cmd = SIOCGIWFREQ;
-	iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq);
-	iwe.u.freq.e = 0;
-	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
-					  IW_EV_FREQ_LEN);
-
-	memset(&iwe, 0, sizeof(iwe));
-	iwe.cmd = SIOCGIWFREQ;
-	iwe.u.freq.m = bss->freq;
-	iwe.u.freq.e = 6;
-	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
-					  IW_EV_FREQ_LEN);
-	memset(&iwe, 0, sizeof(iwe));
-	iwe.cmd = IWEVQUAL;
-	iwe.u.qual.qual = bss->qual;
-	iwe.u.qual.level = bss->signal;
-	iwe.u.qual.noise = bss->noise;
-	iwe.u.qual.updated = local->wstats_flags;
-	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
-					  IW_EV_QUAL_LEN);
-
-	memset(&iwe, 0, sizeof(iwe));
-	iwe.cmd = SIOCGIWENCODE;
-	if (bss->capability & WLAN_CAPABILITY_PRIVACY)
-		iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
-	else
-		iwe.u.data.flags = IW_ENCODE_DISABLED;
-	iwe.u.data.length = 0;
-	current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-					  &iwe, "");
-
-	ieee80211_sta_add_scan_ies(info, bss, &current_ev, end_buf);
-
-	if (bss->supp_rates_len > 0) {
-		/* display all supported rates in readable format */
-		char *p = current_ev + iwe_stream_lcp_len(info);
-		int i;
-
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = SIOCGIWRATE;
-		/* Those two flags are ignored... */
-		iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
-
-		for (i = 0; i < bss->supp_rates_len; i++) {
-			iwe.u.bitrate.value = ((bss->supp_rates[i] &
-							0x7f) * 500000);
-			p = iwe_stream_add_value(info, current_ev, p,
-					end_buf, &iwe, IW_EV_PARAM_LEN);
-		}
-		current_ev = p;
-	}
-
-	buf = kmalloc(30, GFP_ATOMIC);
-	if (buf) {
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVCUSTOM;
-		sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->timestamp));
-		iwe.u.data.length = strlen(buf);
-		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
-						  &iwe, buf);
-		memset(&iwe, 0, sizeof(iwe));
-		iwe.cmd = IWEVCUSTOM;
-		sprintf(buf, " Last beacon: %dms ago",
-			jiffies_to_msecs(jiffies - bss->last_update));
-		iwe.u.data.length = strlen(buf);
-		current_ev = iwe_stream_add_point(info, current_ev,
-						  end_buf, &iwe, buf);
-		kfree(buf);
-	}
-
-	if (bss_mesh_cfg(bss)) {
-		u8 *cfg = bss_mesh_cfg(bss);
-		buf = kmalloc(50, GFP_ATOMIC);
-		if (buf) {
-			memset(&iwe, 0, sizeof(iwe));
-			iwe.cmd = IWEVCUSTOM;
-			sprintf(buf, "Mesh network (version %d)", cfg[0]);
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf,
-							  &iwe, buf);
-			sprintf(buf, "Path Selection Protocol ID: "
-				"0x%02X%02X%02X%02X", cfg[1], cfg[2], cfg[3],
-							cfg[4]);
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf,
-							  &iwe, buf);
-			sprintf(buf, "Path Selection Metric ID: "
-				"0x%02X%02X%02X%02X", cfg[5], cfg[6], cfg[7],
-							cfg[8]);
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf,
-							  &iwe, buf);
-			sprintf(buf, "Congestion Control Mode ID: "
-				"0x%02X%02X%02X%02X", cfg[9], cfg[10],
-							cfg[11], cfg[12]);
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf,
-							  &iwe, buf);
-			sprintf(buf, "Channel Precedence: "
-				"0x%02X%02X%02X%02X", cfg[13], cfg[14],
-							cfg[15], cfg[16]);
-			iwe.u.data.length = strlen(buf);
-			current_ev = iwe_stream_add_point(info, current_ev,
-							  end_buf,
-							  &iwe, buf);
-			kfree(buf);
-		}
-	}
-
-	return current_ev;
-}
-
-
-int ieee80211_sta_scan_results(struct ieee80211_local *local,
-			       struct iw_request_info *info,
-			       char *buf, size_t len)
-{
-	char *current_ev = buf;
-	char *end_buf = buf + len;
-	struct ieee80211_sta_bss *bss;
-
-	spin_lock_bh(&local->sta_bss_lock);
-	list_for_each_entry(bss, &local->sta_bss_list, list) {
-		if (buf + len - current_ev <= IW_EV_ADDR_LEN) {
-			spin_unlock_bh(&local->sta_bss_lock);
-			return -E2BIG;
-		}
-		current_ev = ieee80211_sta_scan_result(local, info, bss,
-						       current_ev, end_buf);
-	}
-	spin_unlock_bh(&local->sta_bss_lock);
-	return current_ev - buf;
-}
-
-
 int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
 {
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
@@ -4290,3 +3749,17 @@ void ieee80211_sta_work(struct work_struct *work)
 					WLAN_REASON_UNSPECIFIED);
 	}
 }
+
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
+	struct ieee80211_if_sta *ifsta;
+
+	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+		ifsta = &sdata->u.sta;
+		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
+		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
+		    !ieee80211_sta_active_ibss(sdata)))
+			ieee80211_sta_find_ibss(sdata, ifsta);
+	}
+}
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
new file mode 100644
index 00000000000..68fa782acd7
--- /dev/null
+++ b/net/mac80211/scan.c
@@ -0,0 +1,553 @@
+/*
+ * BSS client mode implementation
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/wireless.h>
+#include <linux/if_arp.h>
+#include <net/mac80211.h>
+#include <net/iw_handler.h>
+
+#include "ieee80211_i.h"
+
+#define IEEE80211_PROBE_DELAY (HZ / 33)
+#define IEEE80211_CHANNEL_TIME (HZ / 33)
+#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 5)
+
+
+static void ieee80211_send_nullfunc(struct ieee80211_local *local,
+				    struct ieee80211_sub_if_data *sdata,
+				    int powersave)
+{
+	struct sk_buff *skb;
+	struct ieee80211_hdr *nullfunc;
+	__le16 fc;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
+		       "frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
+	memset(nullfunc, 0, 24);
+	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
+			 IEEE80211_FCTL_TODS);
+	if (powersave)
+		fc |= cpu_to_le16(IEEE80211_FCTL_PM);
+	nullfunc->frame_control = fc;
+	memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN);
+	memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
+{
+	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
+	    ieee80211_vif_is_mesh(&sdata->vif))
+		ieee80211_sta_timer((unsigned long)sdata);
+}
+
+void ieee80211_scan_completed(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_sub_if_data *sdata;
+	union iwreq_data wrqu;
+
+	local->last_scan_completed = jiffies;
+	memset(&wrqu, 0, sizeof(wrqu));
+	wireless_send_event(local->scan_sdata->dev, SIOCGIWSCAN, &wrqu, NULL);
+
+	if (local->sta_hw_scanning) {
+		local->sta_hw_scanning = 0;
+		if (ieee80211_hw_config(local))
+			printk(KERN_DEBUG "%s: failed to restore operational "
+			       "channel after scan\n", wiphy_name(local->hw.wiphy));
+		/* Restart STA timer for HW scan case */
+		rcu_read_lock();
+		list_for_each_entry_rcu(sdata, &local->interfaces, list)
+			ieee80211_restart_sta_timer(sdata);
+		rcu_read_unlock();
+
+		goto done;
+	}
+
+	local->sta_sw_scanning = 0;
+	if (ieee80211_hw_config(local))
+		printk(KERN_DEBUG "%s: failed to restore operational "
+		       "channel after scan\n", wiphy_name(local->hw.wiphy));
+
+
+	netif_tx_lock_bh(local->mdev);
+	netif_addr_lock(local->mdev);
+	local->filter_flags &= ~FIF_BCN_PRBRESP_PROMISC;
+	local->ops->configure_filter(local_to_hw(local),
+				     FIF_BCN_PRBRESP_PROMISC,
+				     &local->filter_flags,
+				     local->mdev->mc_count,
+				     local->mdev->mc_list);
+
+	netif_addr_unlock(local->mdev);
+	netif_tx_unlock_bh(local->mdev);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		/* Tell AP we're back */
+		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+				ieee80211_send_nullfunc(local, sdata, 0);
+				netif_tx_wake_all_queues(sdata->dev);
+			}
+		} else
+			netif_tx_wake_all_queues(sdata->dev);
+
+		ieee80211_restart_sta_timer(sdata);
+	}
+	rcu_read_unlock();
+
+ done:
+	ieee80211_mlme_notify_scan_completed(local);
+}
+EXPORT_SYMBOL(ieee80211_scan_completed);
+
+
+void ieee80211_sta_scan_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, scan_work.work);
+	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_channel *chan;
+	int skip;
+	unsigned long next_delay = 0;
+
+	if (!local->sta_sw_scanning)
+		return;
+
+	switch (local->scan_state) {
+	case SCAN_SET_CHANNEL:
+		/*
+		 * Get current scan band. scan_band may be IEEE80211_NUM_BANDS
+		 * after we successfully scanned the last channel of the last
+		 * band (and the last band is supported by the hw)
+		 */
+		if (local->scan_band < IEEE80211_NUM_BANDS)
+			sband = local->hw.wiphy->bands[local->scan_band];
+		else
+			sband = NULL;
+
+		/*
+		 * If we are at an unsupported band and have more bands
+		 * left to scan, advance to the next supported one.
+		 */
+		while (!sband && local->scan_band < IEEE80211_NUM_BANDS - 1) {
+			local->scan_band++;
+			sband = local->hw.wiphy->bands[local->scan_band];
+			local->scan_channel_idx = 0;
+		}
+
+		/* if no more bands/channels left, complete scan */
+		if (!sband || local->scan_channel_idx >= sband->n_channels) {
+			ieee80211_scan_completed(local_to_hw(local));
+			return;
+		}
+		skip = 0;
+		chan = &sband->channels[local->scan_channel_idx];
+
+		if (chan->flags & IEEE80211_CHAN_DISABLED ||
+		    (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+		     chan->flags & IEEE80211_CHAN_NO_IBSS))
+			skip = 1;
+
+		if (!skip) {
+			local->scan_channel = chan;
+			if (ieee80211_hw_config(local)) {
+				printk(KERN_DEBUG "%s: failed to set freq to "
+				       "%d MHz for scan\n", wiphy_name(local->hw.wiphy),
+				       chan->center_freq);
+				skip = 1;
+			}
+		}
+
+		/* advance state machine to next channel/band */
+		local->scan_channel_idx++;
+		if (local->scan_channel_idx >= sband->n_channels) {
+			/*
+			 * scan_band may end up == IEEE80211_NUM_BANDS, but
+			 * we'll catch that case above and complete the scan
+			 * if that is the case.
+			 */
+			local->scan_band++;
+			local->scan_channel_idx = 0;
+		}
+
+		if (skip)
+			break;
+
+		next_delay = IEEE80211_PROBE_DELAY +
+			     usecs_to_jiffies(local->hw.channel_change_time);
+		local->scan_state = SCAN_SEND_PROBE;
+		break;
+	case SCAN_SEND_PROBE:
+		next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
+		local->scan_state = SCAN_SET_CHANNEL;
+
+		if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN)
+			break;
+		ieee80211_send_probe_req(sdata, NULL, local->scan_ssid,
+					 local->scan_ssid_len);
+		next_delay = IEEE80211_CHANNEL_TIME;
+		break;
+	}
+
+	if (local->sta_sw_scanning)
+		queue_delayed_work(local->hw.workqueue, &local->scan_work,
+				   next_delay);
+}
+
+
+int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
+			     u8 *ssid, size_t ssid_len)
+{
+	struct ieee80211_local *local = scan_sdata->local;
+	struct ieee80211_sub_if_data *sdata;
+
+	if (ssid_len > IEEE80211_MAX_SSID_LEN)
+		return -EINVAL;
+
+	/* MLME-SCAN.request (page 118)  page 144 (11.1.3.1)
+	 * BSSType: INFRASTRUCTURE, INDEPENDENT, ANY_BSS
+	 * BSSID: MACAddress
+	 * SSID
+	 * ScanType: ACTIVE, PASSIVE
+	 * ProbeDelay: delay (in microseconds) to be used prior to transmitting
+	 *    a Probe frame during active scanning
+	 * ChannelList
+	 * MinChannelTime (>= ProbeDelay), in TU
+	 * MaxChannelTime: (>= MinChannelTime), in TU
+	 */
+
+	 /* MLME-SCAN.confirm
+	  * BSSDescriptionSet
+	  * ResultCode: SUCCESS, INVALID_PARAMETERS
+	 */
+
+	if (local->sta_sw_scanning || local->sta_hw_scanning) {
+		if (local->scan_sdata == scan_sdata)
+			return 0;
+		return -EBUSY;
+	}
+
+	if (local->ops->hw_scan) {
+		int rc = local->ops->hw_scan(local_to_hw(local),
+					     ssid, ssid_len);
+		if (!rc) {
+			local->sta_hw_scanning = 1;
+			local->scan_sdata = scan_sdata;
+		}
+		return rc;
+	}
+
+	local->sta_sw_scanning = 1;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+				netif_tx_stop_all_queues(sdata->dev);
+				ieee80211_send_nullfunc(local, sdata, 1);
+			}
+		} else
+			netif_tx_stop_all_queues(sdata->dev);
+	}
+	rcu_read_unlock();
+
+	if (ssid) {
+		local->scan_ssid_len = ssid_len;
+		memcpy(local->scan_ssid, ssid, ssid_len);
+	} else
+		local->scan_ssid_len = 0;
+	local->scan_state = SCAN_SET_CHANNEL;
+	local->scan_channel_idx = 0;
+	local->scan_band = IEEE80211_BAND_2GHZ;
+	local->scan_sdata = scan_sdata;
+
+	netif_addr_lock_bh(local->mdev);
+	local->filter_flags |= FIF_BCN_PRBRESP_PROMISC;
+	local->ops->configure_filter(local_to_hw(local),
+				     FIF_BCN_PRBRESP_PROMISC,
+				     &local->filter_flags,
+				     local->mdev->mc_count,
+				     local->mdev->mc_list);
+	netif_addr_unlock_bh(local->mdev);
+
+	/* TODO: start scan as soon as all nullfunc frames are ACKed */
+	queue_delayed_work(local->hw.workqueue, &local->scan_work,
+			   IEEE80211_CHANNEL_TIME);
+
+	return 0;
+}
+
+
+int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len)
+{
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_local *local = sdata->local;
+
+	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+		return ieee80211_sta_start_scan(sdata, ssid, ssid_len);
+
+	if (local->sta_sw_scanning || local->sta_hw_scanning) {
+		if (local->scan_sdata == sdata)
+			return 0;
+		return -EBUSY;
+	}
+
+	ifsta->scan_ssid_len = ssid_len;
+	if (ssid_len)
+		memcpy(ifsta->scan_ssid, ssid, ssid_len);
+	set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request);
+	queue_work(local->hw.workqueue, &ifsta->work);
+	return 0;
+}
+
+
+static void ieee80211_sta_add_scan_ies(struct iw_request_info *info,
+				       struct ieee80211_sta_bss *bss,
+				       char **current_ev, char *end_buf)
+{
+	u8 *pos, *end, *next;
+	struct iw_event iwe;
+
+	if (bss == NULL || bss->ies == NULL)
+		return;
+
+	/*
+	 * If needed, fragment the IEs buffer (at IE boundaries) into short
+	 * enough fragments to fit into IW_GENERIC_IE_MAX octet messages.
+	 */
+	pos = bss->ies;
+	end = pos + bss->ies_len;
+
+	while (end - pos > IW_GENERIC_IE_MAX) {
+		next = pos + 2 + pos[1];
+		while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX)
+			next = next + 2 + next[1];
+
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVGENIE;
+		iwe.u.data.length = next - pos;
+		*current_ev = iwe_stream_add_point(info, *current_ev,
+						   end_buf, &iwe, pos);
+
+		pos = next;
+	}
+
+	if (end > pos) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVGENIE;
+		iwe.u.data.length = end - pos;
+		*current_ev = iwe_stream_add_point(info, *current_ev,
+						   end_buf, &iwe, pos);
+	}
+}
+
+
+static char *
+ieee80211_sta_scan_result(struct ieee80211_local *local,
+			  struct iw_request_info *info,
+			  struct ieee80211_sta_bss *bss,
+			  char *current_ev, char *end_buf)
+{
+	struct iw_event iwe;
+	char *buf;
+
+	if (time_after(jiffies,
+		       bss->last_update + IEEE80211_SCAN_RESULT_EXPIRE))
+		return current_ev;
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWAP;
+	iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
+	memcpy(iwe.u.ap_addr.sa_data, bss->bssid, ETH_ALEN);
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_ADDR_LEN);
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWESSID;
+	if (bss_mesh_cfg(bss)) {
+		iwe.u.data.length = bss_mesh_id_len(bss);
+		iwe.u.data.flags = 1;
+		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+						  &iwe, bss_mesh_id(bss));
+	} else {
+		iwe.u.data.length = bss->ssid_len;
+		iwe.u.data.flags = 1;
+		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+						  &iwe, bss->ssid);
+	}
+
+	if (bss->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)
+	    || bss_mesh_cfg(bss)) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = SIOCGIWMODE;
+		if (bss_mesh_cfg(bss))
+			iwe.u.mode = IW_MODE_MESH;
+		else if (bss->capability & WLAN_CAPABILITY_ESS)
+			iwe.u.mode = IW_MODE_MASTER;
+		else
+			iwe.u.mode = IW_MODE_ADHOC;
+		current_ev = iwe_stream_add_event(info, current_ev, end_buf,
+						  &iwe, IW_EV_UINT_LEN);
+	}
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWFREQ;
+	iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq);
+	iwe.u.freq.e = 0;
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_FREQ_LEN);
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWFREQ;
+	iwe.u.freq.m = bss->freq;
+	iwe.u.freq.e = 6;
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_FREQ_LEN);
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = IWEVQUAL;
+	iwe.u.qual.qual = bss->qual;
+	iwe.u.qual.level = bss->signal;
+	iwe.u.qual.noise = bss->noise;
+	iwe.u.qual.updated = local->wstats_flags;
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_QUAL_LEN);
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWENCODE;
+	if (bss->capability & WLAN_CAPABILITY_PRIVACY)
+		iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
+	else
+		iwe.u.data.flags = IW_ENCODE_DISABLED;
+	iwe.u.data.length = 0;
+	current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+					  &iwe, "");
+
+	ieee80211_sta_add_scan_ies(info, bss, &current_ev, end_buf);
+
+	if (bss->supp_rates_len > 0) {
+		/* display all supported rates in readable format */
+		char *p = current_ev + iwe_stream_lcp_len(info);
+		int i;
+
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = SIOCGIWRATE;
+		/* Those two flags are ignored... */
+		iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
+
+		for (i = 0; i < bss->supp_rates_len; i++) {
+			iwe.u.bitrate.value = ((bss->supp_rates[i] &
+							0x7f) * 500000);
+			p = iwe_stream_add_value(info, current_ev, p,
+					end_buf, &iwe, IW_EV_PARAM_LEN);
+		}
+		current_ev = p;
+	}
+
+	buf = kmalloc(30, GFP_ATOMIC);
+	if (buf) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->timestamp));
+		iwe.u.data.length = strlen(buf);
+		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+						  &iwe, buf);
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		sprintf(buf, " Last beacon: %dms ago",
+			jiffies_to_msecs(jiffies - bss->last_update));
+		iwe.u.data.length = strlen(buf);
+		current_ev = iwe_stream_add_point(info, current_ev,
+						  end_buf, &iwe, buf);
+		kfree(buf);
+	}
+
+	if (bss_mesh_cfg(bss)) {
+		u8 *cfg = bss_mesh_cfg(bss);
+		buf = kmalloc(50, GFP_ATOMIC);
+		if (buf) {
+			memset(&iwe, 0, sizeof(iwe));
+			iwe.cmd = IWEVCUSTOM;
+			sprintf(buf, "Mesh network (version %d)", cfg[0]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Path Selection Protocol ID: "
+				"0x%02X%02X%02X%02X", cfg[1], cfg[2], cfg[3],
+							cfg[4]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Path Selection Metric ID: "
+				"0x%02X%02X%02X%02X", cfg[5], cfg[6], cfg[7],
+							cfg[8]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Congestion Control Mode ID: "
+				"0x%02X%02X%02X%02X", cfg[9], cfg[10],
+							cfg[11], cfg[12]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Channel Precedence: "
+				"0x%02X%02X%02X%02X", cfg[13], cfg[14],
+							cfg[15], cfg[16]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			kfree(buf);
+		}
+	}
+
+	return current_ev;
+}
+
+
+int ieee80211_sta_scan_results(struct ieee80211_local *local,
+			       struct iw_request_info *info,
+			       char *buf, size_t len)
+{
+	char *current_ev = buf;
+	char *end_buf = buf + len;
+	struct ieee80211_sta_bss *bss;
+
+	spin_lock_bh(&local->sta_bss_lock);
+	list_for_each_entry(bss, &local->sta_bss_list, list) {
+		if (buf + len - current_ev <= IW_EV_ADDR_LEN) {
+			spin_unlock_bh(&local->sta_bss_lock);
+			return -E2BIG;
+		}
+		current_ev = ieee80211_sta_scan_result(local, info, bss,
+						       current_ev, end_buf);
+	}
+	spin_unlock_bh(&local->sta_bss_lock);
+	return current_ev - buf;
+}
-- 
cgit v1.2.3


From 98c8fccfaea838e62ffde2f2e44568844e0e5472 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:26 +0200
Subject: mac80211: refactor and move scan RX code

This patch refactors some code and moves the scan RX function
to scan.c. More importantly, however, it changes it so that the
MLME's beacon/probe_resp functions aren't invoked when scanning
so that we can remove a "if (scanning)" conditions from two
places.

There's a very slight behavioural change in this patch: now,
when scanning, IBSS and mesh aren't updated even on the same
channel.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |   9 ++
 net/mac80211/mlme.c        | 253 +++++++++++++++++++++------------------------
 net/mac80211/scan.c        |  68 ++++++++++++
 3 files changed, 194 insertions(+), 136 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 25dccd5cb2f..4753ed3f3f1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -937,6 +937,15 @@ void ieee802_11_parse_elems(u8 *start, size_t len,
 void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
 int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 			     u8 *ssid, size_t ssid_len);
+struct ieee80211_sta_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+			  struct ieee80211_rx_status *rx_status,
+			  struct ieee80211_mgmt *mgmt,
+			  size_t len,
+			  struct ieee802_11_elems *elems,
+			  int freq, bool beacon);
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+			  struct ieee80211_sta_bss *bss);
 
 #ifdef CONFIG_MAC80211_MESH
 void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2caea9759b7..1708a3d1cd3 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -128,10 +128,9 @@ static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
 }
 
 static struct ieee80211_sta_bss *
-ieee80211_rx_bss_add(struct ieee80211_sub_if_data *sdata, u8 *bssid, int freq,
+ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len)
 {
-	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sta_bss *bss;
 
 	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
@@ -230,8 +229,8 @@ static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
 	kfree(bss);
 }
 
-static void ieee80211_rx_bss_put(struct ieee80211_local *local,
-				 struct ieee80211_sta_bss *bss)
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+			  struct ieee80211_sta_bss *bss)
 {
 	local_bh_disable();
 	if (!atomic_dec_and_lock(&bss->users, &local->sta_bss_lock)) {
@@ -2443,74 +2442,16 @@ static u64 ieee80211_sta_get_mandatory_rates(struct ieee80211_local *local,
 	return mandatory_rates;
 }
 
-static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
-				  struct ieee80211_mgmt *mgmt,
-				  size_t len,
-				  struct ieee80211_rx_status *rx_status,
-				  struct ieee802_11_elems *elems)
+struct ieee80211_sta_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+			  struct ieee80211_rx_status *rx_status,
+			  struct ieee80211_mgmt *mgmt,
+			  size_t len,
+			  struct ieee802_11_elems *elems,
+			  int freq, bool beacon)
 {
-	struct ieee80211_local *local = sdata->local;
-	int freq, clen;
 	struct ieee80211_sta_bss *bss;
-	struct sta_info *sta;
-	struct ieee80211_channel *channel;
-	u64 beacon_timestamp, rx_timestamp;
-	u64 supp_rates = 0;
-	bool beacon = ieee80211_is_beacon(mgmt->frame_control);
-	enum ieee80211_band band = rx_status->band;
-	DECLARE_MAC_BUF(mac);
-	DECLARE_MAC_BUF(mac2);
-
-	if (elems->ds_params && elems->ds_params_len == 1)
-		freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
-	else
-		freq = rx_status->freq;
-
-	channel = ieee80211_get_channel(local->hw.wiphy, freq);
-
-	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
-		return;
-
-	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
-	    elems->mesh_config && mesh_matches_local(elems, sdata)) {
-		supp_rates = ieee80211_sta_get_rates(local, elems, band);
-
-		mesh_neighbour_update(mgmt->sa, supp_rates, sdata,
-				      mesh_peer_accepts_plinks(elems));
-	}
-
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems->supp_rates &&
-	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
-		supp_rates = ieee80211_sta_get_rates(local, elems, band);
-
-		rcu_read_lock();
-
-		sta = sta_info_get(local, mgmt->sa);
-		if (sta) {
-			u64 prev_rates;
-
-			prev_rates = sta->supp_rates[band];
-			/* make sure mandatory rates are always added */
-			sta->supp_rates[band] = supp_rates |
-				ieee80211_sta_get_mandatory_rates(local, band);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
-			if (sta->supp_rates[band] != prev_rates)
-				printk(KERN_DEBUG "%s: updated supp_rates set "
-				    "for %s based on beacon info (0x%llx | "
-				    "0x%llx -> 0x%llx)\n",
-				    sdata->dev->name, print_mac(mac, sta->addr),
-				    (unsigned long long) prev_rates,
-				    (unsigned long long) supp_rates,
-				    (unsigned long long) sta->supp_rates[band]);
-#endif
-		} else {
-			ieee80211_ibss_add_sta(sdata, NULL, mgmt->bssid,
-					       mgmt->sa, supp_rates);
-		}
-
-		rcu_read_unlock();
-	}
+	int clen;
 
 #ifdef CONFIG_MAC80211_MESH
 	if (elems->mesh_config)
@@ -2528,10 +2469,10 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				elems->mesh_config_len, freq);
 		else
 #endif
-			bss = ieee80211_rx_bss_add(sdata, mgmt->bssid, freq,
+			bss = ieee80211_rx_bss_add(local, mgmt->bssid, freq,
 						  elems->ssid, elems->ssid_len);
 		if (!bss)
-			return;
+			return NULL;
 	} else {
 #if 0
 		/* TODO: order by RSSI? */
@@ -2578,17 +2519,114 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		bss->supp_rates_len += clen;
 	}
 
-	bss->band = band;
+	bss->band = rx_status->band;
 
-	beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
-
-	bss->timestamp = beacon_timestamp;
+	bss->timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
 	bss->last_update = jiffies;
 	bss->signal = rx_status->signal;
 	bss->noise = rx_status->noise;
 	bss->qual = rx_status->qual;
+	bss->wmm_used = elems->wmm_param || elems->wmm_info;
+
 	if (!beacon)
 		bss->last_probe_resp = jiffies;
+
+	/*
+	 * For probe responses, or if we don't have any information yet,
+	 * use the IEs from the beacon.
+	 */
+	if (!bss->ies || !beacon) {
+		if (bss->ies == NULL || bss->ies_len < elems->total_len) {
+			kfree(bss->ies);
+			bss->ies = kmalloc(elems->total_len, GFP_ATOMIC);
+		}
+		if (bss->ies) {
+			memcpy(bss->ies, elems->ie_start, elems->total_len);
+			bss->ies_len = elems->total_len;
+		} else
+			bss->ies_len = 0;
+	}
+
+	return bss;
+}
+
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len,
+				  struct ieee80211_rx_status *rx_status,
+				  struct ieee802_11_elems *elems,
+				  bool beacon)
+{
+	struct ieee80211_local *local = sdata->local;
+	int freq;
+	struct ieee80211_sta_bss *bss;
+	struct sta_info *sta;
+	struct ieee80211_channel *channel;
+	u64 beacon_timestamp, rx_timestamp;
+	u64 supp_rates = 0;
+	enum ieee80211_band band = rx_status->band;
+	DECLARE_MAC_BUF(mac);
+	DECLARE_MAC_BUF(mac2);
+
+	if (elems->ds_params && elems->ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return;
+
+	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
+	    elems->mesh_config && mesh_matches_local(elems, sdata)) {
+		supp_rates = ieee80211_sta_get_rates(local, elems, band);
+
+		mesh_neighbour_update(mgmt->sa, supp_rates, sdata,
+				      mesh_peer_accepts_plinks(elems));
+	}
+
+	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems->supp_rates &&
+	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
+		supp_rates = ieee80211_sta_get_rates(local, elems, band);
+
+		rcu_read_lock();
+
+		sta = sta_info_get(local, mgmt->sa);
+		if (sta) {
+			u64 prev_rates;
+
+			prev_rates = sta->supp_rates[band];
+			/* make sure mandatory rates are always added */
+			sta->supp_rates[band] = supp_rates |
+				ieee80211_sta_get_mandatory_rates(local, band);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+			if (sta->supp_rates[band] != prev_rates)
+				printk(KERN_DEBUG "%s: updated supp_rates set "
+				    "for %s based on beacon info (0x%llx | "
+				    "0x%llx -> 0x%llx)\n",
+				    sdata->dev->name, print_mac(mac, sta->addr),
+				    (unsigned long long) prev_rates,
+				    (unsigned long long) supp_rates,
+				    (unsigned long long) sta->supp_rates[band]);
+#endif
+		} else {
+			ieee80211_ibss_add_sta(sdata, NULL, mgmt->bssid,
+					       mgmt->sa, supp_rates);
+		}
+
+		rcu_read_unlock();
+	}
+
+	bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+					freq, beacon);
+	if (!bss)
+		return;
+
+	/* was just updated in ieee80211_bss_info_update */
+	beacon_timestamp = bss->timestamp;
+
 	/*
 	 * In STA mode, the remaining parameters should not be overridden
 	 * by beacons because they're not necessarily accurate there.
@@ -2599,21 +2637,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 
-	if (bss->ies == NULL || bss->ies_len < elems->total_len) {
-		kfree(bss->ies);
-		bss->ies = kmalloc(elems->total_len, GFP_ATOMIC);
-	}
-	if (bss->ies) {
-		memcpy(bss->ies, elems->ie_start, elems->total_len);
-		bss->ies_len = elems->total_len;
-	} else
-		bss->ies_len = 0;
-
-	bss->wmm_used = elems->wmm_param || elems->wmm_info;
-
 	/* check if we need to merge IBSS */
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && beacon &&
-	    !local->sta_sw_scanning && !local->sta_hw_scanning &&
 	    bss->capability & WLAN_CAPABILITY_IBSS &&
 	    bss->freq == local->oper_channel->center_freq &&
 	    elems->ssid_len == sdata->u.sta.ssid_len &&
@@ -2690,7 +2715,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
 	ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
 				&elems);
 
-	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false);
 
 	/* direct probe may be part of the association flow */
 	if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE,
@@ -2721,7 +2746,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 
 	ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems);
 
-	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 		return;
@@ -2731,15 +2756,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	    memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
 		return;
 
-	/* Do not send changes to driver if we are scanning. This removes
-	 * requirement that a driver's bss_info_changed/conf_tx functions
-	 * need to be atomic.
-	 * This is really ugly code, we should rewrite scanning and make
-	 * all this more understandable for humans.
-	 */
-	if (local->sta_sw_scanning || local->sta_hw_scanning)
-		return;
-
 	ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
 				 elems.wmm_param_len);
 
@@ -2982,41 +2998,6 @@ static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 }
 
 
-ieee80211_rx_result
-ieee80211_sta_rx_scan(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-		      struct ieee80211_rx_status *rx_status)
-{
-	struct ieee80211_mgmt *mgmt;
-	__le16 fc;
-
-	if (skb->len < 2)
-		return RX_DROP_UNUSABLE;
-
-	mgmt = (struct ieee80211_mgmt *) skb->data;
-	fc = mgmt->frame_control;
-
-	if (ieee80211_is_ctl(fc))
-		return RX_CONTINUE;
-
-	if (skb->len < 24)
-		return RX_DROP_MONITOR;
-
-	if (ieee80211_is_probe_resp(fc)) {
-		ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, rx_status);
-		dev_kfree_skb(skb);
-		return RX_QUEUED;
-	}
-
-	if (ieee80211_is_beacon(fc)) {
-		ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status);
-		dev_kfree_skb(skb);
-		return RX_QUEUED;
-	}
-
-	return RX_CONTINUE;
-}
-
-
 static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
@@ -3233,7 +3214,7 @@ static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata,
 	printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %s\n",
 	       sdata->dev->name, print_mac(mac, bssid));
 
-	bss = ieee80211_rx_bss_add(sdata, bssid,
+	bss = ieee80211_rx_bss_add(local, bssid,
 				   local->hw.conf.channel->center_freq,
 				   sdata->u.sta.ssid, sdata->u.sta.ssid_len);
 	if (!bss)
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 68fa782acd7..2848ba3a08e 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -23,6 +23,74 @@
 #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 5)
 
 
+ieee80211_rx_result
+ieee80211_sta_rx_scan(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		      struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_mgmt *mgmt;
+	struct ieee80211_sta_bss *bss;
+	u8 *elements;
+	struct ieee80211_channel *channel;
+	size_t baselen;
+	int freq;
+	__le16 fc;
+	bool presp, beacon = false;
+	struct ieee802_11_elems elems;
+
+	if (skb->len < 2)
+		return RX_DROP_UNUSABLE;
+
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+	fc = mgmt->frame_control;
+
+	if (ieee80211_is_ctl(fc))
+		return RX_CONTINUE;
+
+	if (skb->len < 24)
+		return RX_DROP_MONITOR;
+
+	presp = ieee80211_is_probe_resp(fc);
+	if (presp) {
+		/* ignore ProbeResp to foreign address */
+		if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
+			return RX_DROP_MONITOR;
+
+		presp = true;
+		elements = mgmt->u.probe_resp.variable;
+		baselen = offsetof(struct ieee80211_mgmt, u.probe_resp.variable);
+	} else {
+		beacon = ieee80211_is_beacon(fc);
+		baselen = offsetof(struct ieee80211_mgmt, u.beacon.variable);
+		elements = mgmt->u.beacon.variable;
+	}
+
+	if (!presp && !beacon)
+		return RX_CONTINUE;
+
+	if (baselen > skb->len)
+		return RX_DROP_MONITOR;
+
+	ieee802_11_parse_elems(elements, skb->len - baselen, &elems);
+
+	if (elems.ds_params && elems.ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems.ds_params[0]);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(sdata->local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return RX_DROP_MONITOR;
+
+	bss = ieee80211_bss_info_update(sdata->local, rx_status,
+					mgmt, skb->len, &elems,
+					freq, beacon);
+	ieee80211_rx_bss_put(sdata->local, bss);
+
+	dev_kfree_skb(skb);
+	return RX_QUEUED;
+}
+
 static void ieee80211_send_nullfunc(struct ieee80211_local *local,
 				    struct ieee80211_sub_if_data *sdata,
 				    int powersave)
-- 
cgit v1.2.3


From 5484e23749e78d5a4f56928efaf3c4b0d862b7a6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:27 +0200
Subject: mac80211: move BSS handling to scan code

This moves all the BSS list handling out of mlme.c to scan.c,
no further changes except fixing kzalloc/atomic_inc/atomic_inc
to kzalloc/atomic_set(2).

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |   6 +
 net/mac80211/mlme.c        | 310 +--------------------------------------------
 net/mac80211/scan.c        | 305 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 313 insertions(+), 308 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4753ed3f3f1..792c09ca08e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -944,6 +944,12 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 			  size_t len,
 			  struct ieee802_11_elems *elems,
 			  int freq, bool beacon);
+struct ieee80211_sta_bss *
+ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len);
+struct ieee80211_sta_bss *
+ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len);
 void ieee80211_rx_bss_put(struct ieee80211_local *local,
 			  struct ieee80211_sta_bss *bss);
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 1708a3d1cd3..be3292bfabe 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -11,11 +11,6 @@
  * published by the Free Software Foundation.
  */
 
-/* TODO:
- * order BSS list by RSSI(?) ("quality of AP")
- * scan result table filtering (by capability (privacy, IBSS/BSS, WPA/RSN IE,
- *    SSID)
- */
 #include <linux/delay.h>
 #include <linux/if_ether.h>
 #include <linux/skbuff.h>
@@ -67,195 +62,10 @@
 #define IEEE80211_MIN_AMPDU_BUF 0x8
 #define IEEE80211_MAX_AMPDU_BUF 0x40
 
-/* BSS handling */
-static struct ieee80211_sta_bss *
-ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
-		     u8 *ssid, u8 ssid_len)
-{
-	struct ieee80211_sta_bss *bss;
-
-	spin_lock_bh(&local->sta_bss_lock);
-	bss = local->sta_bss_hash[STA_HASH(bssid)];
-	while (bss) {
-		if (!bss_mesh_cfg(bss) &&
-		    !memcmp(bss->bssid, bssid, ETH_ALEN) &&
-		    bss->freq == freq &&
-		    bss->ssid_len == ssid_len &&
-		    (ssid_len == 0 || !memcmp(bss->ssid, ssid, ssid_len))) {
-			atomic_inc(&bss->users);
-			break;
-		}
-		bss = bss->hnext;
-	}
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-
-/* Caller must hold local->sta_bss_lock */
-static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
-					struct ieee80211_sta_bss *bss)
-{
-	u8 hash_idx;
-
-	if (bss_mesh_cfg(bss))
-		hash_idx = mesh_id_hash(bss_mesh_id(bss),
-					bss_mesh_id_len(bss));
-	else
-		hash_idx = STA_HASH(bss->bssid);
-
-	bss->hnext = local->sta_bss_hash[hash_idx];
-	local->sta_bss_hash[hash_idx] = bss;
-}
-
-/* Caller must hold local->sta_bss_lock */
-static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
-					struct ieee80211_sta_bss *bss)
-{
-	struct ieee80211_sta_bss *b, *prev = NULL;
-	b = local->sta_bss_hash[STA_HASH(bss->bssid)];
-	while (b) {
-		if (b == bss) {
-			if (!prev)
-				local->sta_bss_hash[STA_HASH(bss->bssid)] =
-					bss->hnext;
-			else
-				prev->hnext = bss->hnext;
-			break;
-		}
-		prev = b;
-		b = b->hnext;
-	}
-}
-
-static struct ieee80211_sta_bss *
-ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
-		     u8 *ssid, u8 ssid_len)
-{
-	struct ieee80211_sta_bss *bss;
-
-	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
-	if (!bss)
-		return NULL;
-	atomic_inc(&bss->users);
-	atomic_inc(&bss->users);
-	memcpy(bss->bssid, bssid, ETH_ALEN);
-	bss->freq = freq;
-	if (ssid && ssid_len <= IEEE80211_MAX_SSID_LEN) {
-		memcpy(bss->ssid, ssid, ssid_len);
-		bss->ssid_len = ssid_len;
-	}
-
-	spin_lock_bh(&local->sta_bss_lock);
-	/* TODO: order by RSSI? */
-	list_add_tail(&bss->list, &local->sta_bss_list);
-	__ieee80211_rx_bss_hash_add(local, bss);
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-
-#ifdef CONFIG_MAC80211_MESH
-static struct ieee80211_sta_bss *
-ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
-			  u8 *mesh_cfg, int freq)
-{
-	struct ieee80211_sta_bss *bss;
-
-	spin_lock_bh(&local->sta_bss_lock);
-	bss = local->sta_bss_hash[mesh_id_hash(mesh_id, mesh_id_len)];
-	while (bss) {
-		if (bss_mesh_cfg(bss) &&
-		    !memcmp(bss_mesh_cfg(bss), mesh_cfg, MESH_CFG_CMP_LEN) &&
-		    bss->freq == freq &&
-		    mesh_id_len == bss->mesh_id_len &&
-		    (mesh_id_len == 0 || !memcmp(bss->mesh_id, mesh_id,
-						 mesh_id_len))) {
-			atomic_inc(&bss->users);
-			break;
-		}
-		bss = bss->hnext;
-	}
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-
-static struct ieee80211_sta_bss *
-ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
-			  u8 *mesh_cfg, int mesh_config_len, int freq)
-{
-	struct ieee80211_sta_bss *bss;
-
-	if (mesh_config_len != MESH_CFG_LEN)
-		return NULL;
-
-	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
-	if (!bss)
-		return NULL;
-
-	bss->mesh_cfg = kmalloc(MESH_CFG_CMP_LEN, GFP_ATOMIC);
-	if (!bss->mesh_cfg) {
-		kfree(bss);
-		return NULL;
-	}
-
-	if (mesh_id_len && mesh_id_len <= IEEE80211_MAX_MESH_ID_LEN) {
-		bss->mesh_id = kmalloc(mesh_id_len, GFP_ATOMIC);
-		if (!bss->mesh_id) {
-			kfree(bss->mesh_cfg);
-			kfree(bss);
-			return NULL;
-		}
-		memcpy(bss->mesh_id, mesh_id, mesh_id_len);
-	}
-
-	atomic_inc(&bss->users);
-	atomic_inc(&bss->users);
-	memcpy(bss->mesh_cfg, mesh_cfg, MESH_CFG_CMP_LEN);
-	bss->mesh_id_len = mesh_id_len;
-	bss->freq = freq;
-	spin_lock_bh(&local->sta_bss_lock);
-	/* TODO: order by RSSI? */
-	list_add_tail(&bss->list, &local->sta_bss_list);
-	__ieee80211_rx_bss_hash_add(local, bss);
-	spin_unlock_bh(&local->sta_bss_lock);
-	return bss;
-}
-#endif
-
-static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
-{
-	kfree(bss->ies);
-	kfree(bss_mesh_id(bss));
-	kfree(bss_mesh_cfg(bss));
-	kfree(bss);
-}
-
-void ieee80211_rx_bss_put(struct ieee80211_local *local,
-			  struct ieee80211_sta_bss *bss)
-{
-	local_bh_disable();
-	if (!atomic_dec_and_lock(&bss->users, &local->sta_bss_lock)) {
-		local_bh_enable();
-		return;
-	}
-
-	__ieee80211_rx_bss_hash_del(local, bss);
-	list_del(&bss->list);
-	spin_unlock_bh(&local->sta_bss_lock);
-	ieee80211_rx_bss_free(bss);
-}
-
-void ieee80211_rx_bss_list_init(struct ieee80211_local *local)
-{
-	spin_lock_init(&local->sta_bss_lock);
-	INIT_LIST_HEAD(&local->sta_bss_list);
-}
-
-void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local)
+/* utils */
+static int ecw2cw(int ecw)
 {
-	struct ieee80211_sta_bss *bss, *tmp;
-
-	list_for_each_entry_safe(bss, tmp, &local->sta_bss_list, list)
-		ieee80211_rx_bss_put(local, bss);
+	return (1 << ecw) - 1;
 }
 
 static u8 *ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
@@ -278,12 +88,6 @@ static u8 *ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
 	return NULL;
 }
 
-/* utils */
-static int ecw2cw(int ecw)
-{
-	return (1 << ecw) - 1;
-}
-
 /* frame sending functions */
 void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      int encrypt)
@@ -2442,114 +2246,6 @@ static u64 ieee80211_sta_get_mandatory_rates(struct ieee80211_local *local,
 	return mandatory_rates;
 }
 
-struct ieee80211_sta_bss *
-ieee80211_bss_info_update(struct ieee80211_local *local,
-			  struct ieee80211_rx_status *rx_status,
-			  struct ieee80211_mgmt *mgmt,
-			  size_t len,
-			  struct ieee802_11_elems *elems,
-			  int freq, bool beacon)
-{
-	struct ieee80211_sta_bss *bss;
-	int clen;
-
-#ifdef CONFIG_MAC80211_MESH
-	if (elems->mesh_config)
-		bss = ieee80211_rx_mesh_bss_get(local, elems->mesh_id,
-				elems->mesh_id_len, elems->mesh_config, freq);
-	else
-#endif
-		bss = ieee80211_rx_bss_get(local, mgmt->bssid, freq,
-					   elems->ssid, elems->ssid_len);
-	if (!bss) {
-#ifdef CONFIG_MAC80211_MESH
-		if (elems->mesh_config)
-			bss = ieee80211_rx_mesh_bss_add(local, elems->mesh_id,
-				elems->mesh_id_len, elems->mesh_config,
-				elems->mesh_config_len, freq);
-		else
-#endif
-			bss = ieee80211_rx_bss_add(local, mgmt->bssid, freq,
-						  elems->ssid, elems->ssid_len);
-		if (!bss)
-			return NULL;
-	} else {
-#if 0
-		/* TODO: order by RSSI? */
-		spin_lock_bh(&local->sta_bss_lock);
-		list_move_tail(&bss->list, &local->sta_bss_list);
-		spin_unlock_bh(&local->sta_bss_lock);
-#endif
-	}
-
-	/* save the ERP value so that it is available at association time */
-	if (elems->erp_info && elems->erp_info_len >= 1) {
-		bss->erp_value = elems->erp_info[0];
-		bss->has_erp_value = 1;
-	}
-
-	bss->beacon_int = le16_to_cpu(mgmt->u.beacon.beacon_int);
-	bss->capability = le16_to_cpu(mgmt->u.beacon.capab_info);
-
-	if (elems->tim) {
-		struct ieee80211_tim_ie *tim_ie =
-			(struct ieee80211_tim_ie *)elems->tim;
-		bss->dtim_period = tim_ie->dtim_period;
-	}
-
-	/* set default value for buggy APs */
-	if (!elems->tim || bss->dtim_period == 0)
-		bss->dtim_period = 1;
-
-	bss->supp_rates_len = 0;
-	if (elems->supp_rates) {
-		clen = IEEE80211_MAX_SUPP_RATES - bss->supp_rates_len;
-		if (clen > elems->supp_rates_len)
-			clen = elems->supp_rates_len;
-		memcpy(&bss->supp_rates[bss->supp_rates_len], elems->supp_rates,
-		       clen);
-		bss->supp_rates_len += clen;
-	}
-	if (elems->ext_supp_rates) {
-		clen = IEEE80211_MAX_SUPP_RATES - bss->supp_rates_len;
-		if (clen > elems->ext_supp_rates_len)
-			clen = elems->ext_supp_rates_len;
-		memcpy(&bss->supp_rates[bss->supp_rates_len],
-		       elems->ext_supp_rates, clen);
-		bss->supp_rates_len += clen;
-	}
-
-	bss->band = rx_status->band;
-
-	bss->timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
-	bss->last_update = jiffies;
-	bss->signal = rx_status->signal;
-	bss->noise = rx_status->noise;
-	bss->qual = rx_status->qual;
-	bss->wmm_used = elems->wmm_param || elems->wmm_info;
-
-	if (!beacon)
-		bss->last_probe_resp = jiffies;
-
-	/*
-	 * For probe responses, or if we don't have any information yet,
-	 * use the IEs from the beacon.
-	 */
-	if (!bss->ies || !beacon) {
-		if (bss->ies == NULL || bss->ies_len < elems->total_len) {
-			kfree(bss->ies);
-			bss->ies = kmalloc(elems->total_len, GFP_ATOMIC);
-		}
-		if (bss->ies) {
-			memcpy(bss->ies, elems->ie_start, elems->total_len);
-			bss->ies_len = elems->total_len;
-		} else
-			bss->ies_len = 0;
-	}
-
-	return bss;
-}
-
 static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_mgmt *mgmt,
 				  size_t len,
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 2848ba3a08e..1beefb5ad6f 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -1,5 +1,6 @@
 /*
- * BSS client mode implementation
+ * Scanning implementation
+ *
  * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
  * Copyright 2004, Instant802 Networks, Inc.
  * Copyright 2005, Devicescape Software, Inc.
@@ -11,17 +12,319 @@
  * published by the Free Software Foundation.
  */
 
+/* TODO:
+ * order BSS list by RSSI(?) ("quality of AP")
+ * scan result table filtering (by capability (privacy, IBSS/BSS, WPA/RSN IE,
+ *    SSID)
+ */
+
 #include <linux/wireless.h>
 #include <linux/if_arp.h>
 #include <net/mac80211.h>
 #include <net/iw_handler.h>
 
 #include "ieee80211_i.h"
+#include "mesh.h"
 
 #define IEEE80211_PROBE_DELAY (HZ / 33)
 #define IEEE80211_CHANNEL_TIME (HZ / 33)
 #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 5)
 
+void ieee80211_rx_bss_list_init(struct ieee80211_local *local)
+{
+	spin_lock_init(&local->sta_bss_lock);
+	INIT_LIST_HEAD(&local->sta_bss_list);
+}
+
+void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local)
+{
+	struct ieee80211_sta_bss *bss, *tmp;
+
+	list_for_each_entry_safe(bss, tmp, &local->sta_bss_list, list)
+		ieee80211_rx_bss_put(local, bss);
+}
+
+struct ieee80211_sta_bss *
+ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len)
+{
+	struct ieee80211_sta_bss *bss;
+
+	spin_lock_bh(&local->sta_bss_lock);
+	bss = local->sta_bss_hash[STA_HASH(bssid)];
+	while (bss) {
+		if (!bss_mesh_cfg(bss) &&
+		    !memcmp(bss->bssid, bssid, ETH_ALEN) &&
+		    bss->freq == freq &&
+		    bss->ssid_len == ssid_len &&
+		    (ssid_len == 0 || !memcmp(bss->ssid, ssid, ssid_len))) {
+			atomic_inc(&bss->users);
+			break;
+		}
+		bss = bss->hnext;
+	}
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+
+/* Caller must hold local->sta_bss_lock */
+static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
+					struct ieee80211_sta_bss *bss)
+{
+	u8 hash_idx;
+
+	if (bss_mesh_cfg(bss))
+		hash_idx = mesh_id_hash(bss_mesh_id(bss),
+					bss_mesh_id_len(bss));
+	else
+		hash_idx = STA_HASH(bss->bssid);
+
+	bss->hnext = local->sta_bss_hash[hash_idx];
+	local->sta_bss_hash[hash_idx] = bss;
+}
+
+/* Caller must hold local->sta_bss_lock */
+static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
+					struct ieee80211_sta_bss *bss)
+{
+	struct ieee80211_sta_bss *b, *prev = NULL;
+	b = local->sta_bss_hash[STA_HASH(bss->bssid)];
+	while (b) {
+		if (b == bss) {
+			if (!prev)
+				local->sta_bss_hash[STA_HASH(bss->bssid)] =
+					bss->hnext;
+			else
+				prev->hnext = bss->hnext;
+			break;
+		}
+		prev = b;
+		b = b->hnext;
+	}
+}
+
+struct ieee80211_sta_bss *
+ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len)
+{
+	struct ieee80211_sta_bss *bss;
+
+	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
+	if (!bss)
+		return NULL;
+	atomic_set(&bss->users, 2);
+	memcpy(bss->bssid, bssid, ETH_ALEN);
+	bss->freq = freq;
+	if (ssid && ssid_len <= IEEE80211_MAX_SSID_LEN) {
+		memcpy(bss->ssid, ssid, ssid_len);
+		bss->ssid_len = ssid_len;
+	}
+
+	spin_lock_bh(&local->sta_bss_lock);
+	/* TODO: order by RSSI? */
+	list_add_tail(&bss->list, &local->sta_bss_list);
+	__ieee80211_rx_bss_hash_add(local, bss);
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+
+#ifdef CONFIG_MAC80211_MESH
+static struct ieee80211_sta_bss *
+ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
+			  u8 *mesh_cfg, int freq)
+{
+	struct ieee80211_sta_bss *bss;
+
+	spin_lock_bh(&local->sta_bss_lock);
+	bss = local->sta_bss_hash[mesh_id_hash(mesh_id, mesh_id_len)];
+	while (bss) {
+		if (bss_mesh_cfg(bss) &&
+		    !memcmp(bss_mesh_cfg(bss), mesh_cfg, MESH_CFG_CMP_LEN) &&
+		    bss->freq == freq &&
+		    mesh_id_len == bss->mesh_id_len &&
+		    (mesh_id_len == 0 || !memcmp(bss->mesh_id, mesh_id,
+						 mesh_id_len))) {
+			atomic_inc(&bss->users);
+			break;
+		}
+		bss = bss->hnext;
+	}
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+
+static struct ieee80211_sta_bss *
+ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
+			  u8 *mesh_cfg, int mesh_config_len, int freq)
+{
+	struct ieee80211_sta_bss *bss;
+
+	if (mesh_config_len != MESH_CFG_LEN)
+		return NULL;
+
+	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
+	if (!bss)
+		return NULL;
+
+	bss->mesh_cfg = kmalloc(MESH_CFG_CMP_LEN, GFP_ATOMIC);
+	if (!bss->mesh_cfg) {
+		kfree(bss);
+		return NULL;
+	}
+
+	if (mesh_id_len && mesh_id_len <= IEEE80211_MAX_MESH_ID_LEN) {
+		bss->mesh_id = kmalloc(mesh_id_len, GFP_ATOMIC);
+		if (!bss->mesh_id) {
+			kfree(bss->mesh_cfg);
+			kfree(bss);
+			return NULL;
+		}
+		memcpy(bss->mesh_id, mesh_id, mesh_id_len);
+	}
+
+	atomic_set(&bss->users, 2);
+	memcpy(bss->mesh_cfg, mesh_cfg, MESH_CFG_CMP_LEN);
+	bss->mesh_id_len = mesh_id_len;
+	bss->freq = freq;
+	spin_lock_bh(&local->sta_bss_lock);
+	/* TODO: order by RSSI? */
+	list_add_tail(&bss->list, &local->sta_bss_list);
+	__ieee80211_rx_bss_hash_add(local, bss);
+	spin_unlock_bh(&local->sta_bss_lock);
+	return bss;
+}
+#endif
+
+static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
+{
+	kfree(bss->ies);
+	kfree(bss_mesh_id(bss));
+	kfree(bss_mesh_cfg(bss));
+	kfree(bss);
+}
+
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+			  struct ieee80211_sta_bss *bss)
+{
+	local_bh_disable();
+	if (!atomic_dec_and_lock(&bss->users, &local->sta_bss_lock)) {
+		local_bh_enable();
+		return;
+	}
+
+	__ieee80211_rx_bss_hash_del(local, bss);
+	list_del(&bss->list);
+	spin_unlock_bh(&local->sta_bss_lock);
+	ieee80211_rx_bss_free(bss);
+}
+
+struct ieee80211_sta_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+			  struct ieee80211_rx_status *rx_status,
+			  struct ieee80211_mgmt *mgmt,
+			  size_t len,
+			  struct ieee802_11_elems *elems,
+			  int freq, bool beacon)
+{
+	struct ieee80211_sta_bss *bss;
+	int clen;
+
+#ifdef CONFIG_MAC80211_MESH
+	if (elems->mesh_config)
+		bss = ieee80211_rx_mesh_bss_get(local, elems->mesh_id,
+				elems->mesh_id_len, elems->mesh_config, freq);
+	else
+#endif
+		bss = ieee80211_rx_bss_get(local, mgmt->bssid, freq,
+					   elems->ssid, elems->ssid_len);
+	if (!bss) {
+#ifdef CONFIG_MAC80211_MESH
+		if (elems->mesh_config)
+			bss = ieee80211_rx_mesh_bss_add(local, elems->mesh_id,
+				elems->mesh_id_len, elems->mesh_config,
+				elems->mesh_config_len, freq);
+		else
+#endif
+			bss = ieee80211_rx_bss_add(local, mgmt->bssid, freq,
+						  elems->ssid, elems->ssid_len);
+		if (!bss)
+			return NULL;
+	} else {
+#if 0
+		/* TODO: order by RSSI? */
+		spin_lock_bh(&local->sta_bss_lock);
+		list_move_tail(&bss->list, &local->sta_bss_list);
+		spin_unlock_bh(&local->sta_bss_lock);
+#endif
+	}
+
+	/* save the ERP value so that it is available at association time */
+	if (elems->erp_info && elems->erp_info_len >= 1) {
+		bss->erp_value = elems->erp_info[0];
+		bss->has_erp_value = 1;
+	}
+
+	bss->beacon_int = le16_to_cpu(mgmt->u.beacon.beacon_int);
+	bss->capability = le16_to_cpu(mgmt->u.beacon.capab_info);
+
+	if (elems->tim) {
+		struct ieee80211_tim_ie *tim_ie =
+			(struct ieee80211_tim_ie *)elems->tim;
+		bss->dtim_period = tim_ie->dtim_period;
+	}
+
+	/* set default value for buggy APs */
+	if (!elems->tim || bss->dtim_period == 0)
+		bss->dtim_period = 1;
+
+	bss->supp_rates_len = 0;
+	if (elems->supp_rates) {
+		clen = IEEE80211_MAX_SUPP_RATES - bss->supp_rates_len;
+		if (clen > elems->supp_rates_len)
+			clen = elems->supp_rates_len;
+		memcpy(&bss->supp_rates[bss->supp_rates_len], elems->supp_rates,
+		       clen);
+		bss->supp_rates_len += clen;
+	}
+	if (elems->ext_supp_rates) {
+		clen = IEEE80211_MAX_SUPP_RATES - bss->supp_rates_len;
+		if (clen > elems->ext_supp_rates_len)
+			clen = elems->ext_supp_rates_len;
+		memcpy(&bss->supp_rates[bss->supp_rates_len],
+		       elems->ext_supp_rates, clen);
+		bss->supp_rates_len += clen;
+	}
+
+	bss->band = rx_status->band;
+
+	bss->timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
+	bss->last_update = jiffies;
+	bss->signal = rx_status->signal;
+	bss->noise = rx_status->noise;
+	bss->qual = rx_status->qual;
+	bss->wmm_used = elems->wmm_param || elems->wmm_info;
+
+	if (!beacon)
+		bss->last_probe_resp = jiffies;
+
+	/*
+	 * For probe responses, or if we don't have any information yet,
+	 * use the IEs from the beacon.
+	 */
+	if (!bss->ies || !beacon) {
+		if (bss->ies == NULL || bss->ies_len < elems->total_len) {
+			kfree(bss->ies);
+			bss->ies = kmalloc(elems->total_len, GFP_ATOMIC);
+		}
+		if (bss->ies) {
+			memcpy(bss->ies, elems->ie_start, elems->total_len);
+			bss->ies_len = elems->total_len;
+		} else
+			bss->ies_len = 0;
+	}
+
+	return bss;
+}
 
 ieee80211_rx_result
 ieee80211_sta_rx_scan(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-- 
cgit v1.2.3


From 44d414dbff9d5bf46fc09f2e68567b5848cbbfd3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:28 +0200
Subject: mac80211: move some HT code out of mlme.c

Some of the HT code in mlme.c is misplaced:
 * constants/definitions belong to the ieee80211.h header
 * code being used in other modes as well shouldn't be there

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/Makefile |   1 +
 net/mac80211/ht.c     | 328 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/mlme.c   | 321 ------------------------------------------------
 3 files changed, 329 insertions(+), 321 deletions(-)
 create mode 100644 net/mac80211/ht.c

(limited to 'net')

diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 376280a420a..1a7ac50910c 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -8,6 +8,7 @@ mac80211-y := \
 	wep.o \
 	wpa.o \
 	scan.o \
+	ht.o \
 	mlme.o \
 	iface.o \
 	rate.o \
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
new file mode 100644
index 00000000000..5ccf1bc1746
--- /dev/null
+++ b/net/mac80211/ht.c
@@ -0,0 +1,328 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2008, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <net/wireless.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+
+int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
+				   struct ieee80211_ht_info *ht_info)
+{
+
+	if (ht_info == NULL)
+		return -EINVAL;
+
+	memset(ht_info, 0, sizeof(*ht_info));
+
+	if (ht_cap_ie) {
+		u8 ampdu_info = ht_cap_ie->ampdu_params_info;
+
+		ht_info->ht_supported = 1;
+		ht_info->cap = le16_to_cpu(ht_cap_ie->cap_info);
+		ht_info->ampdu_factor =
+			ampdu_info & IEEE80211_HT_CAP_AMPDU_FACTOR;
+		ht_info->ampdu_density =
+			(ampdu_info & IEEE80211_HT_CAP_AMPDU_DENSITY) >> 2;
+		memcpy(ht_info->supp_mcs_set, ht_cap_ie->supp_mcs_set, 16);
+	} else
+		ht_info->ht_supported = 0;
+
+	return 0;
+}
+
+int ieee80211_ht_addt_info_ie_to_ht_bss_info(
+			struct ieee80211_ht_addt_info *ht_add_info_ie,
+			struct ieee80211_ht_bss_info *bss_info)
+{
+	if (bss_info == NULL)
+		return -EINVAL;
+
+	memset(bss_info, 0, sizeof(*bss_info));
+
+	if (ht_add_info_ie) {
+		u16 op_mode;
+		op_mode = le16_to_cpu(ht_add_info_ie->operation_mode);
+
+		bss_info->primary_channel = ht_add_info_ie->control_chan;
+		bss_info->bss_cap = ht_add_info_ie->ht_param;
+		bss_info->bss_op_mode = (u8)(op_mode & 0xff);
+	}
+
+	return 0;
+}
+
+void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da,
+				u16 tid, u8 dialog_token, u16 start_seq_num,
+				u16 agg_size, u16 timeout)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 capab;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer "
+				"for addba request frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
+	else
+		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
+
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
+
+	mgmt->u.action.u.addba_req.dialog_token = dialog_token;
+	capab = (u16)(1 << 1);		/* bit 1 aggregation policy */
+	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
+	capab |= (u16)(agg_size << 6);	/* bit 15:6 max size of aggergation */
+
+	mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab);
+
+	mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout);
+	mgmt->u.action.u.addba_req.start_seq_num =
+					cpu_to_le16(start_seq_num << 4);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid,
+			  u16 initiator, u16 reason_code)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 params;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer "
+					"for delba frame\n", sdata->dev->name);
+		return;
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
+	else
+		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba));
+
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA;
+	params = (u16)(initiator << 11); 	/* bit 11 initiator */
+	params |= (u16)(tid << 12); 		/* bit 15:12 TID number */
+
+	mgmt->u.action.u.delba.params = cpu_to_le16(params);
+	mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_bar *bar;
+	u16 bar_control = 0;
+
+	skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom);
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer for "
+			"bar frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar));
+	memset(bar, 0, sizeof(*bar));
+	bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+					 IEEE80211_STYPE_BACK_REQ);
+	memcpy(bar->ra, ra, ETH_ALEN);
+	memcpy(bar->ta, sdata->dev->dev_addr, ETH_ALEN);
+	bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL;
+	bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA;
+	bar_control |= (u16)(tid << 12);
+	bar->control = cpu_to_le16(bar_control);
+	bar->start_seq_num = cpu_to_le16(ssn);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid,
+					u16 initiator, u16 reason)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_hw *hw = &local->hw;
+	struct sta_info *sta;
+	int ret, i;
+	DECLARE_MAC_BUF(mac);
+
+	rcu_read_lock();
+
+	sta = sta_info_get(local, ra);
+	if (!sta) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* check if TID is in operational state */
+	spin_lock_bh(&sta->lock);
+	if (sta->ampdu_mlme.tid_state_rx[tid]
+				!= HT_AGG_STATE_OPERATIONAL) {
+		spin_unlock_bh(&sta->lock);
+		rcu_read_unlock();
+		return;
+	}
+	sta->ampdu_mlme.tid_state_rx[tid] =
+		HT_AGG_STATE_REQ_STOP_BA_MSK |
+		(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
+	spin_unlock_bh(&sta->lock);
+
+	/* stop HW Rx aggregation. ampdu_action existence
+	 * already verified in session init so we add the BUG_ON */
+	BUG_ON(!local->ops->ampdu_action);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Rx BA session stop requested for %s tid %u\n",
+				print_mac(mac, ra), tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_STOP,
+					ra, tid, NULL);
+	if (ret)
+		printk(KERN_DEBUG "HW problem - can not stop rx "
+				"aggregation for tid %d\n", tid);
+
+	/* shutdown timer has not expired */
+	if (initiator != WLAN_BACK_TIMER)
+		del_timer_sync(&sta->ampdu_mlme.tid_rx[tid]->session_timer);
+
+	/* check if this is a self generated aggregation halt */
+	if (initiator == WLAN_BACK_RECIPIENT || initiator == WLAN_BACK_TIMER)
+		ieee80211_send_delba(sdata, ra, tid, 0, reason);
+
+	/* free the reordering buffer */
+	for (i = 0; i < sta->ampdu_mlme.tid_rx[tid]->buf_size; i++) {
+		if (sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]) {
+			/* release the reordered frames */
+			dev_kfree_skb(sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]);
+			sta->ampdu_mlme.tid_rx[tid]->stored_mpdu_num--;
+			sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i] = NULL;
+		}
+	}
+	/* free resources */
+	kfree(sta->ampdu_mlme.tid_rx[tid]->reorder_buf);
+	kfree(sta->ampdu_mlme.tid_rx[tid]);
+	sta->ampdu_mlme.tid_rx[tid] = NULL;
+	sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_IDLE;
+
+	rcu_read_unlock();
+}
+
+
+/*
+ * After sending add Block Ack request we activated a timer until
+ * add Block Ack response will arrive from the recipient.
+ * If this timer expires sta_addba_resp_timer_expired will be executed.
+ */
+void sta_addba_resp_timer_expired(unsigned long data)
+{
+	/* not an elegant detour, but there is no choice as the timer passes
+	 * only one argument, and both sta_info and TID are needed, so init
+	 * flow in sta_info_create gives the TID as data, while the timer_to_id
+	 * array gives the sta through container_of */
+	u16 tid = *(u8 *)data;
+	struct sta_info *temp_sta = container_of((void *)data,
+		struct sta_info, timer_to_tid[tid]);
+
+	struct ieee80211_local *local = temp_sta->local;
+	struct ieee80211_hw *hw = &local->hw;
+	struct sta_info *sta;
+	u8 *state;
+
+	rcu_read_lock();
+
+	sta = sta_info_get(local, temp_sta->addr);
+	if (!sta) {
+		rcu_read_unlock();
+		return;
+	}
+
+	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	/* check if the TID waits for addBA response */
+	spin_lock_bh(&sta->lock);
+	if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
+		spin_unlock_bh(&sta->lock);
+		*state = HT_AGG_STATE_IDLE;
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "timer expired on tid %d but we are not "
+				"expecting addBA response there", tid);
+#endif
+		goto timer_expired_exit;
+	}
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid);
+#endif
+
+	/* go through the state check in stop_BA_session */
+	*state = HT_AGG_STATE_OPERATIONAL;
+	spin_unlock_bh(&sta->lock);
+	ieee80211_stop_tx_ba_session(hw, temp_sta->addr, tid,
+				     WLAN_BACK_INITIATOR);
+
+timer_expired_exit:
+	rcu_read_unlock();
+}
+
+void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr)
+{
+	struct ieee80211_local *local = sdata->local;
+	int i;
+
+	for (i = 0; i <  STA_TID_NUM; i++) {
+		ieee80211_stop_tx_ba_session(&local->hw, addr, i,
+					     WLAN_BACK_INITIATOR);
+		ieee80211_sta_stop_rx_ba_session(sdata, addr, i,
+						 WLAN_BACK_RECIPIENT,
+						 WLAN_REASON_QSTA_LEAVE_QBSS);
+	}
+}
+
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index be3292bfabe..f43ca7b88d5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -48,20 +48,6 @@
 #define IEEE80211_IBSS_MAX_STA_ENTRIES 128
 
 
-/* mgmt header + 1 byte category code */
-#define IEEE80211_MIN_ACTION_SIZE (24 + 1)
-
-#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
-#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C
-#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFA0
-#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000
-#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800
-
-/* next values represent the buffer size for A-MPDU frame.
- * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */
-#define IEEE80211_MIN_AMPDU_BUF 0x8
-#define IEEE80211_MAX_AMPDU_BUF 0x40
-
 /* utils */
 static int ecw2cw(int ecw)
 {
@@ -389,52 +375,6 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
 	return changed;
 }
 
-int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
-				   struct ieee80211_ht_info *ht_info)
-{
-
-	if (ht_info == NULL)
-		return -EINVAL;
-
-	memset(ht_info, 0, sizeof(*ht_info));
-
-	if (ht_cap_ie) {
-		u8 ampdu_info = ht_cap_ie->ampdu_params_info;
-
-		ht_info->ht_supported = 1;
-		ht_info->cap = le16_to_cpu(ht_cap_ie->cap_info);
-		ht_info->ampdu_factor =
-			ampdu_info & IEEE80211_HT_CAP_AMPDU_FACTOR;
-		ht_info->ampdu_density =
-			(ampdu_info & IEEE80211_HT_CAP_AMPDU_DENSITY) >> 2;
-		memcpy(ht_info->supp_mcs_set, ht_cap_ie->supp_mcs_set, 16);
-	} else
-		ht_info->ht_supported = 0;
-
-	return 0;
-}
-
-int ieee80211_ht_addt_info_ie_to_ht_bss_info(
-			struct ieee80211_ht_addt_info *ht_add_info_ie,
-			struct ieee80211_ht_bss_info *bss_info)
-{
-	if (bss_info == NULL)
-		return -EINVAL;
-
-	memset(bss_info, 0, sizeof(*bss_info));
-
-	if (ht_add_info_ie) {
-		u16 op_mode;
-		op_mode = le16_to_cpu(ht_add_info_ie->operation_mode);
-
-		bss_info->primary_channel = ht_add_info_ie->control_chan;
-		bss_info->bss_cap = ht_add_info_ie->ht_param;
-		bss_info->bss_op_mode = (u8)(op_mode & 0xff);
-	}
-
-	return 0;
-}
-
 static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata,
 					struct ieee80211_if_sta *ifsta)
 {
@@ -1118,55 +1058,6 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
 	return;
 }
 
-void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da,
-				u16 tid, u8 dialog_token, u16 start_seq_num,
-				u16 agg_size, u16 timeout)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-	u16 capab;
-
-	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
-
-	if (!skb) {
-		printk(KERN_ERR "%s: failed to allocate buffer "
-				"for addba request frame\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
-	else
-		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_ACTION);
-
-	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
-
-	mgmt->u.action.category = WLAN_CATEGORY_BACK;
-	mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
-
-	mgmt->u.action.u.addba_req.dialog_token = dialog_token;
-	capab = (u16)(1 << 1);		/* bit 1 aggregation policy */
-	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
-	capab |= (u16)(agg_size << 6);	/* bit 15:6 max size of aggergation */
-
-	mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab);
-
-	mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout);
-	mgmt->u.action.u.addba_req.start_seq_num =
-					cpu_to_le16(start_seq_num << 4);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
 /*
  * After accepting the AddBA Request we activated a timer,
  * resetting it after each frame that arrives from the originator.
@@ -1396,149 +1287,6 @@ addba_resp_exit:
 	rcu_read_unlock();
 }
 
-void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid,
-			  u16 initiator, u16 reason_code)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-	u16 params;
-
-	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
-
-	if (!skb) {
-		printk(KERN_ERR "%s: failed to allocate buffer "
-					"for delba frame\n", sdata->dev->name);
-		return;
-	}
-
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
-	else
-		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_ACTION);
-
-	skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba));
-
-	mgmt->u.action.category = WLAN_CATEGORY_BACK;
-	mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA;
-	params = (u16)(initiator << 11); 	/* bit 11 initiator */
-	params |= (u16)(tid << 12); 		/* bit 15:12 TID number */
-
-	mgmt->u.action.u.delba.params = cpu_to_le16(params);
-	mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
-void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_bar *bar;
-	u16 bar_control = 0;
-
-	skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom);
-	if (!skb) {
-		printk(KERN_ERR "%s: failed to allocate buffer for "
-			"bar frame\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar));
-	memset(bar, 0, sizeof(*bar));
-	bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
-					 IEEE80211_STYPE_BACK_REQ);
-	memcpy(bar->ra, ra, ETH_ALEN);
-	memcpy(bar->ta, sdata->dev->dev_addr, ETH_ALEN);
-	bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL;
-	bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA;
-	bar_control |= (u16)(tid << 12);
-	bar->control = cpu_to_le16(bar_control);
-	bar->start_seq_num = cpu_to_le16(ssn);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
-void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid,
-					u16 initiator, u16 reason)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_hw *hw = &local->hw;
-	struct sta_info *sta;
-	int ret, i;
-	DECLARE_MAC_BUF(mac);
-
-	rcu_read_lock();
-
-	sta = sta_info_get(local, ra);
-	if (!sta) {
-		rcu_read_unlock();
-		return;
-	}
-
-	/* check if TID is in operational state */
-	spin_lock_bh(&sta->lock);
-	if (sta->ampdu_mlme.tid_state_rx[tid]
-				!= HT_AGG_STATE_OPERATIONAL) {
-		spin_unlock_bh(&sta->lock);
-		rcu_read_unlock();
-		return;
-	}
-	sta->ampdu_mlme.tid_state_rx[tid] =
-		HT_AGG_STATE_REQ_STOP_BA_MSK |
-		(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
-	spin_unlock_bh(&sta->lock);
-
-	/* stop HW Rx aggregation. ampdu_action existence
-	 * already verified in session init so we add the BUG_ON */
-	BUG_ON(!local->ops->ampdu_action);
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "Rx BA session stop requested for %s tid %u\n",
-				print_mac(mac, ra), tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-
-	ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_STOP,
-					ra, tid, NULL);
-	if (ret)
-		printk(KERN_DEBUG "HW problem - can not stop rx "
-				"aggregation for tid %d\n", tid);
-
-	/* shutdown timer has not expired */
-	if (initiator != WLAN_BACK_TIMER)
-		del_timer_sync(&sta->ampdu_mlme.tid_rx[tid]->session_timer);
-
-	/* check if this is a self generated aggregation halt */
-	if (initiator == WLAN_BACK_RECIPIENT || initiator == WLAN_BACK_TIMER)
-		ieee80211_send_delba(sdata, ra, tid, 0, reason);
-
-	/* free the reordering buffer */
-	for (i = 0; i < sta->ampdu_mlme.tid_rx[tid]->buf_size; i++) {
-		if (sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]) {
-			/* release the reordered frames */
-			dev_kfree_skb(sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]);
-			sta->ampdu_mlme.tid_rx[tid]->stored_mpdu_num--;
-			sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i] = NULL;
-		}
-	}
-	/* free resources */
-	kfree(sta->ampdu_mlme.tid_rx[tid]->reorder_buf);
-	kfree(sta->ampdu_mlme.tid_rx[tid]);
-	sta->ampdu_mlme.tid_rx[tid] = NULL;
-	sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_IDLE;
-
-	rcu_read_unlock();
-}
-
-
 static void ieee80211_sta_process_delba(struct ieee80211_sub_if_data *sdata,
 			struct ieee80211_mgmt *mgmt, size_t len)
 {
@@ -1582,75 +1330,6 @@ static void ieee80211_sta_process_delba(struct ieee80211_sub_if_data *sdata,
 	rcu_read_unlock();
 }
 
-/*
- * After sending add Block Ack request we activated a timer until
- * add Block Ack response will arrive from the recipient.
- * If this timer expires sta_addba_resp_timer_expired will be executed.
- */
-void sta_addba_resp_timer_expired(unsigned long data)
-{
-	/* not an elegant detour, but there is no choice as the timer passes
-	 * only one argument, and both sta_info and TID are needed, so init
-	 * flow in sta_info_create gives the TID as data, while the timer_to_id
-	 * array gives the sta through container_of */
-	u16 tid = *(u8 *)data;
-	struct sta_info *temp_sta = container_of((void *)data,
-		struct sta_info, timer_to_tid[tid]);
-
-	struct ieee80211_local *local = temp_sta->local;
-	struct ieee80211_hw *hw = &local->hw;
-	struct sta_info *sta;
-	u8 *state;
-
-	rcu_read_lock();
-
-	sta = sta_info_get(local, temp_sta->addr);
-	if (!sta) {
-		rcu_read_unlock();
-		return;
-	}
-
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-	/* check if the TID waits for addBA response */
-	spin_lock_bh(&sta->lock);
-	if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
-		spin_unlock_bh(&sta->lock);
-		*state = HT_AGG_STATE_IDLE;
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "timer expired on tid %d but we are not "
-				"expecting addBA response there", tid);
-#endif
-		goto timer_expired_exit;
-	}
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid);
-#endif
-
-	/* go through the state check in stop_BA_session */
-	*state = HT_AGG_STATE_OPERATIONAL;
-	spin_unlock_bh(&sta->lock);
-	ieee80211_stop_tx_ba_session(hw, temp_sta->addr, tid,
-				     WLAN_BACK_INITIATOR);
-
-timer_expired_exit:
-	rcu_read_unlock();
-}
-
-void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr)
-{
-	struct ieee80211_local *local = sdata->local;
-	int i;
-
-	for (i = 0; i <  STA_TID_NUM; i++) {
-		ieee80211_stop_tx_ba_session(&local->hw, addr, i,
-					     WLAN_BACK_INITIATOR);
-		ieee80211_sta_stop_rx_ba_session(sdata, addr, i,
-						 WLAN_BACK_RECIPIENT,
-						 WLAN_REASON_QSTA_LEAVE_QBSS);
-	}
-}
-
 static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
 					struct ieee80211_msrment_ie *request_ie,
 					const u8 *da, const u8 *bssid,
-- 
cgit v1.2.3


From bacac545f10f2bf6e5ceff0d8e2b82dfc493602a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:44:29 +0200
Subject: mac80211: move some HT code out of main.c

Now that I've created ht.c, I can move the aggregation
code from main.c into it.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ht.c   | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/mac80211/main.c | 373 ---------------------------------------------------
 2 files changed, 375 insertions(+), 375 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 5ccf1bc1746..c72b3fe3ccd 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -2,8 +2,8 @@
  * HT handling
  *
  * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
- * Copyright 2004, Instant802 Networks, Inc.
- * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2008, Intel Corporation
@@ -18,6 +18,7 @@
 #include <net/mac80211.h>
 #include "ieee80211_i.h"
 #include "sta_info.h"
+#include "wme.h"
 
 int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
 				   struct ieee80211_ht_info *ht_info)
@@ -326,3 +327,375 @@ void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8
 	}
 }
 
+int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sta_info *sta;
+	struct ieee80211_sub_if_data *sdata;
+	u16 start_seq_num;
+	u8 *state;
+	int ret;
+	DECLARE_MAC_BUF(mac);
+
+	if (tid >= STA_TID_NUM)
+		return -EINVAL;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Open BA session requested for %s tid %u\n",
+				print_mac(mac, ra), tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	rcu_read_lock();
+
+	sta = sta_info_get(local, ra);
+	if (!sta) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Could not find the station\n");
+#endif
+		ret = -ENOENT;
+		goto exit;
+	}
+
+	spin_lock_bh(&sta->lock);
+
+	/* we have tried too many times, receiver does not want A-MPDU */
+	if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
+		ret = -EBUSY;
+		goto err_unlock_sta;
+	}
+
+	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	/* check if the TID is not in aggregation flow already */
+	if (*state != HT_AGG_STATE_IDLE) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA request denied - session is not "
+				 "idle on tid %u\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		ret = -EAGAIN;
+		goto err_unlock_sta;
+	}
+
+	/* prepare A-MPDU MLME for Tx aggregation */
+	sta->ampdu_mlme.tid_tx[tid] =
+			kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
+	if (!sta->ampdu_mlme.tid_tx[tid]) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_ERR "allocate tx mlme to tid %d failed\n",
+					tid);
+#endif
+		ret = -ENOMEM;
+		goto err_unlock_sta;
+	}
+	/* Tx timer */
+	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function =
+			sta_addba_resp_timer_expired;
+	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.data =
+			(unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
+
+	/* create a new queue for this aggregation */
+	ret = ieee80211_ht_agg_queue_add(local, sta, tid);
+
+	/* case no queue is available to aggregation
+	 * don't switch to aggregation */
+	if (ret) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA request denied - queue unavailable for"
+					" tid %d\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		goto err_unlock_queue;
+	}
+	sdata = sta->sdata;
+
+	/* Ok, the Addba frame hasn't been sent yet, but if the driver calls the
+	 * call back right away, it must see that the flow has begun */
+	*state |= HT_ADDBA_REQUESTED_MSK;
+
+	/* This is slightly racy because the queue isn't stopped */
+	start_seq_num = sta->tid_seq[tid];
+
+	if (local->ops->ampdu_action)
+		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
+						ra, tid, &start_seq_num);
+
+	if (ret) {
+		/* No need to requeue the packets in the agg queue, since we
+		 * held the tx lock: no packet could be enqueued to the newly
+		 * allocated queue */
+		ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA request denied - HW unavailable for"
+					" tid %d\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		*state = HT_AGG_STATE_IDLE;
+		goto err_unlock_queue;
+	}
+
+	/* Will put all the packets in the new SW queue */
+	ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
+	spin_unlock_bh(&sta->lock);
+
+	/* send an addBA request */
+	sta->ampdu_mlme.dialog_token_allocator++;
+	sta->ampdu_mlme.tid_tx[tid]->dialog_token =
+			sta->ampdu_mlme.dialog_token_allocator;
+	sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
+
+
+	ieee80211_send_addba_request(sta->sdata, ra, tid,
+			 sta->ampdu_mlme.tid_tx[tid]->dialog_token,
+			 sta->ampdu_mlme.tid_tx[tid]->ssn,
+			 0x40, 5000);
+	/* activate the timer for the recipient's addBA response */
+	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.expires =
+				jiffies + ADDBA_RESP_INTERVAL;
+	add_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
+#endif
+	goto exit;
+
+err_unlock_queue:
+	kfree(sta->ampdu_mlme.tid_tx[tid]);
+	sta->ampdu_mlme.tid_tx[tid] = NULL;
+	ret = -EBUSY;
+err_unlock_sta:
+	spin_unlock_bh(&sta->lock);
+exit:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_session);
+
+int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw,
+				 u8 *ra, u16 tid,
+				 enum ieee80211_back_parties initiator)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sta_info *sta;
+	u8 *state;
+	int ret = 0;
+	DECLARE_MAC_BUF(mac);
+
+	if (tid >= STA_TID_NUM)
+		return -EINVAL;
+
+	rcu_read_lock();
+	sta = sta_info_get(local, ra);
+	if (!sta) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* check if the TID is in aggregation */
+	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	spin_lock_bh(&sta->lock);
+
+	if (*state != HT_AGG_STATE_OPERATIONAL) {
+		ret = -ENOENT;
+		goto stop_BA_exit;
+	}
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Tx BA session stop requested for %s tid %u\n",
+				print_mac(mac, ra), tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]);
+
+	*state = HT_AGG_STATE_REQ_STOP_BA_MSK |
+		(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
+
+	if (local->ops->ampdu_action)
+		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_STOP,
+						ra, tid, NULL);
+
+	/* case HW denied going back to legacy */
+	if (ret) {
+		WARN_ON(ret != -EBUSY);
+		*state = HT_AGG_STATE_OPERATIONAL;
+		ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+		goto stop_BA_exit;
+	}
+
+stop_BA_exit:
+	spin_unlock_bh(&sta->lock);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_session);
+
+void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sta_info *sta;
+	u8 *state;
+	DECLARE_MAC_BUF(mac);
+
+	if (tid >= STA_TID_NUM) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n",
+				tid, STA_TID_NUM);
+#endif
+		return;
+	}
+
+	rcu_read_lock();
+	sta = sta_info_get(local, ra);
+	if (!sta) {
+		rcu_read_unlock();
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Could not find station: %s\n",
+				print_mac(mac, ra));
+#endif
+		return;
+	}
+
+	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	spin_lock_bh(&sta->lock);
+
+	if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "addBA was not requested yet, state is %d\n",
+				*state);
+#endif
+		spin_unlock_bh(&sta->lock);
+		rcu_read_unlock();
+		return;
+	}
+
+	WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK);
+
+	*state |= HT_ADDBA_DRV_READY_MSK;
+
+	if (*state == HT_AGG_STATE_OPERATIONAL) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid);
+#endif
+		ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+	}
+	spin_unlock_bh(&sta->lock);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_cb);
+
+void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sta_info *sta;
+	u8 *state;
+	int agg_queue;
+	DECLARE_MAC_BUF(mac);
+
+	if (tid >= STA_TID_NUM) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n",
+				tid, STA_TID_NUM);
+#endif
+		return;
+	}
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Stopping Tx BA session for %s tid %d\n",
+				print_mac(mac, ra), tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	rcu_read_lock();
+	sta = sta_info_get(local, ra);
+	if (!sta) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Could not find station: %s\n",
+				print_mac(mac, ra));
+#endif
+		rcu_read_unlock();
+		return;
+	}
+	state = &sta->ampdu_mlme.tid_state_tx[tid];
+
+	/* NOTE: no need to use sta->lock in this state check, as
+	 * ieee80211_stop_tx_ba_session will let only one stop call to
+	 * pass through per sta/tid
+	 */
+	if ((*state & HT_AGG_STATE_REQ_STOP_BA_MSK) == 0) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n");
+#endif
+		rcu_read_unlock();
+		return;
+	}
+
+	if (*state & HT_AGG_STATE_INITIATOR_MSK)
+		ieee80211_send_delba(sta->sdata, ra, tid,
+			WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
+
+	agg_queue = sta->tid_to_tx_q[tid];
+
+	ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
+
+	/* We just requeued the all the frames that were in the
+	 * removed queue, and since we might miss a softirq we do
+	 * netif_schedule_queue.  ieee80211_wake_queue is not used
+	 * here as this queue is not necessarily stopped
+	 */
+	netif_schedule_queue(netdev_get_tx_queue(local->mdev, agg_queue));
+	spin_lock_bh(&sta->lock);
+	*state = HT_AGG_STATE_IDLE;
+	sta->ampdu_mlme.addba_req_num[tid] = 0;
+	kfree(sta->ampdu_mlme.tid_tx[tid]);
+	sta->ampdu_mlme.tid_tx[tid] = NULL;
+	spin_unlock_bh(&sta->lock);
+
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb);
+
+void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_hw *hw,
+				      const u8 *ra, u16 tid)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_ra_tid *ra_tid;
+	struct sk_buff *skb = dev_alloc_skb(0);
+
+	if (unlikely(!skb)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_WARNING "%s: Not enough memory, "
+			       "dropping start BA session", skb->dev->name);
+#endif
+		return;
+	}
+	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
+	memcpy(&ra_tid->ra, ra, ETH_ALEN);
+	ra_tid->tid = tid;
+
+	skb->pkt_type = IEEE80211_ADDBA_MSG;
+	skb_queue_tail(&local->skb_queue, skb);
+	tasklet_schedule(&local->tasklet);
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
+
+void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_hw *hw,
+				     const u8 *ra, u16 tid)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_ra_tid *ra_tid;
+	struct sk_buff *skb = dev_alloc_skb(0);
+
+	if (unlikely(!skb)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_WARNING "%s: Not enough memory, "
+			       "dropping stop BA session", skb->dev->name);
+#endif
+		return;
+	}
+	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
+	memcpy(&ra_tid->ra, ra, ETH_ALEN);
+	ra_tid->tid = tid;
+
+	skb->pkt_type = IEEE80211_DELBA_MSG;
+	skb_queue_tail(&local->skb_queue, skb);
+	tasklet_schedule(&local->tasklet);
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 6df4a2e1509..f90254a5948 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -593,379 +593,6 @@ static int ieee80211_stop(struct net_device *dev)
 	return 0;
 }
 
-int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct sta_info *sta;
-	struct ieee80211_sub_if_data *sdata;
-	u16 start_seq_num;
-	u8 *state;
-	int ret;
-	DECLARE_MAC_BUF(mac);
-
-	if (tid >= STA_TID_NUM)
-		return -EINVAL;
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "Open BA session requested for %s tid %u\n",
-				print_mac(mac, ra), tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-
-	rcu_read_lock();
-
-	sta = sta_info_get(local, ra);
-	if (!sta) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Could not find the station\n");
-#endif
-		ret = -ENOENT;
-		goto exit;
-	}
-
-	spin_lock_bh(&sta->lock);
-
-	/* we have tried too many times, receiver does not want A-MPDU */
-	if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
-		ret = -EBUSY;
-		goto err_unlock_sta;
-	}
-
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-	/* check if the TID is not in aggregation flow already */
-	if (*state != HT_AGG_STATE_IDLE) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "BA request denied - session is not "
-				 "idle on tid %u\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		ret = -EAGAIN;
-		goto err_unlock_sta;
-	}
-
-	/* prepare A-MPDU MLME for Tx aggregation */
-	sta->ampdu_mlme.tid_tx[tid] =
-			kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
-	if (!sta->ampdu_mlme.tid_tx[tid]) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_ERR "allocate tx mlme to tid %d failed\n",
-					tid);
-#endif
-		ret = -ENOMEM;
-		goto err_unlock_sta;
-	}
-	/* Tx timer */
-	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function =
-			sta_addba_resp_timer_expired;
-	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.data =
-			(unsigned long)&sta->timer_to_tid[tid];
-	init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
-
-	/* create a new queue for this aggregation */
-	ret = ieee80211_ht_agg_queue_add(local, sta, tid);
-
-	/* case no queue is available to aggregation
-	 * don't switch to aggregation */
-	if (ret) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "BA request denied - queue unavailable for"
-					" tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		goto err_unlock_queue;
-	}
-	sdata = sta->sdata;
-
-	/* Ok, the Addba frame hasn't been sent yet, but if the driver calls the
-	 * call back right away, it must see that the flow has begun */
-	*state |= HT_ADDBA_REQUESTED_MSK;
-
-	/* This is slightly racy because the queue isn't stopped */
-	start_seq_num = sta->tid_seq[tid];
-
-	if (local->ops->ampdu_action)
-		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
-						ra, tid, &start_seq_num);
-
-	if (ret) {
-		/* No need to requeue the packets in the agg queue, since we
-		 * held the tx lock: no packet could be enqueued to the newly
-		 * allocated queue */
-		ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "BA request denied - HW unavailable for"
-					" tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		*state = HT_AGG_STATE_IDLE;
-		goto err_unlock_queue;
-	}
-
-	/* Will put all the packets in the new SW queue */
-	ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
-	spin_unlock_bh(&sta->lock);
-
-	/* send an addBA request */
-	sta->ampdu_mlme.dialog_token_allocator++;
-	sta->ampdu_mlme.tid_tx[tid]->dialog_token =
-			sta->ampdu_mlme.dialog_token_allocator;
-	sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
-
-
-	ieee80211_send_addba_request(sta->sdata, ra, tid,
-			 sta->ampdu_mlme.tid_tx[tid]->dialog_token,
-			 sta->ampdu_mlme.tid_tx[tid]->ssn,
-			 0x40, 5000);
-	/* activate the timer for the recipient's addBA response */
-	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.expires =
-				jiffies + ADDBA_RESP_INTERVAL;
-	add_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
-#endif
-	goto exit;
-
-err_unlock_queue:
-	kfree(sta->ampdu_mlme.tid_tx[tid]);
-	sta->ampdu_mlme.tid_tx[tid] = NULL;
-	ret = -EBUSY;
-err_unlock_sta:
-	spin_unlock_bh(&sta->lock);
-exit:
-	rcu_read_unlock();
-	return ret;
-}
-EXPORT_SYMBOL(ieee80211_start_tx_ba_session);
-
-int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw,
-				 u8 *ra, u16 tid,
-				 enum ieee80211_back_parties initiator)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct sta_info *sta;
-	u8 *state;
-	int ret = 0;
-	DECLARE_MAC_BUF(mac);
-
-	if (tid >= STA_TID_NUM)
-		return -EINVAL;
-
-	rcu_read_lock();
-	sta = sta_info_get(local, ra);
-	if (!sta) {
-		rcu_read_unlock();
-		return -ENOENT;
-	}
-
-	/* check if the TID is in aggregation */
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-	spin_lock_bh(&sta->lock);
-
-	if (*state != HT_AGG_STATE_OPERATIONAL) {
-		ret = -ENOENT;
-		goto stop_BA_exit;
-	}
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "Tx BA session stop requested for %s tid %u\n",
-				print_mac(mac, ra), tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-
-	ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]);
-
-	*state = HT_AGG_STATE_REQ_STOP_BA_MSK |
-		(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
-
-	if (local->ops->ampdu_action)
-		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_STOP,
-						ra, tid, NULL);
-
-	/* case HW denied going back to legacy */
-	if (ret) {
-		WARN_ON(ret != -EBUSY);
-		*state = HT_AGG_STATE_OPERATIONAL;
-		ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
-		goto stop_BA_exit;
-	}
-
-stop_BA_exit:
-	spin_unlock_bh(&sta->lock);
-	rcu_read_unlock();
-	return ret;
-}
-EXPORT_SYMBOL(ieee80211_stop_tx_ba_session);
-
-void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct sta_info *sta;
-	u8 *state;
-	DECLARE_MAC_BUF(mac);
-
-	if (tid >= STA_TID_NUM) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n",
-				tid, STA_TID_NUM);
-#endif
-		return;
-	}
-
-	rcu_read_lock();
-	sta = sta_info_get(local, ra);
-	if (!sta) {
-		rcu_read_unlock();
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Could not find station: %s\n",
-				print_mac(mac, ra));
-#endif
-		return;
-	}
-
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-	spin_lock_bh(&sta->lock);
-
-	if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "addBA was not requested yet, state is %d\n",
-				*state);
-#endif
-		spin_unlock_bh(&sta->lock);
-		rcu_read_unlock();
-		return;
-	}
-
-	WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK);
-
-	*state |= HT_ADDBA_DRV_READY_MSK;
-
-	if (*state == HT_AGG_STATE_OPERATIONAL) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid);
-#endif
-		ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
-	}
-	spin_unlock_bh(&sta->lock);
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(ieee80211_start_tx_ba_cb);
-
-void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct sta_info *sta;
-	u8 *state;
-	int agg_queue;
-	DECLARE_MAC_BUF(mac);
-
-	if (tid >= STA_TID_NUM) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n",
-				tid, STA_TID_NUM);
-#endif
-		return;
-	}
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "Stopping Tx BA session for %s tid %d\n",
-				print_mac(mac, ra), tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-
-	rcu_read_lock();
-	sta = sta_info_get(local, ra);
-	if (!sta) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Could not find station: %s\n",
-				print_mac(mac, ra));
-#endif
-		rcu_read_unlock();
-		return;
-	}
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-
-	/* NOTE: no need to use sta->lock in this state check, as
-	 * ieee80211_stop_tx_ba_session will let only one stop call to
-	 * pass through per sta/tid
-	 */
-	if ((*state & HT_AGG_STATE_REQ_STOP_BA_MSK) == 0) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n");
-#endif
-		rcu_read_unlock();
-		return;
-	}
-
-	if (*state & HT_AGG_STATE_INITIATOR_MSK)
-		ieee80211_send_delba(sta->sdata, ra, tid,
-			WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
-
-	agg_queue = sta->tid_to_tx_q[tid];
-
-	ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
-
-	/* We just requeued the all the frames that were in the
-	 * removed queue, and since we might miss a softirq we do
-	 * netif_schedule_queue.  ieee80211_wake_queue is not used
-	 * here as this queue is not necessarily stopped
-	 */
-	netif_schedule_queue(netdev_get_tx_queue(local->mdev, agg_queue));
-	spin_lock_bh(&sta->lock);
-	*state = HT_AGG_STATE_IDLE;
-	sta->ampdu_mlme.addba_req_num[tid] = 0;
-	kfree(sta->ampdu_mlme.tid_tx[tid]);
-	sta->ampdu_mlme.tid_tx[tid] = NULL;
-	spin_unlock_bh(&sta->lock);
-
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb);
-
-void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_hw *hw,
-				      const u8 *ra, u16 tid)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct ieee80211_ra_tid *ra_tid;
-	struct sk_buff *skb = dev_alloc_skb(0);
-
-	if (unlikely(!skb)) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_WARNING "%s: Not enough memory, "
-			       "dropping start BA session", skb->dev->name);
-#endif
-		return;
-	}
-	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
-	memcpy(&ra_tid->ra, ra, ETH_ALEN);
-	ra_tid->tid = tid;
-
-	skb->pkt_type = IEEE80211_ADDBA_MSG;
-	skb_queue_tail(&local->skb_queue, skb);
-	tasklet_schedule(&local->tasklet);
-}
-EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
-
-void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_hw *hw,
-				     const u8 *ra, u16 tid)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct ieee80211_ra_tid *ra_tid;
-	struct sk_buff *skb = dev_alloc_skb(0);
-
-	if (unlikely(!skb)) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_WARNING "%s: Not enough memory, "
-			       "dropping stop BA session", skb->dev->name);
-#endif
-		return;
-	}
-	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
-	memcpy(&ra_tid->ra, ra, ETH_ALEN);
-	ra_tid->tid = tid;
-
-	skb->pkt_type = IEEE80211_DELBA_MSG;
-	skb_queue_tail(&local->skb_queue, skb);
-	tasklet_schedule(&local->tasklet);
-}
-EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
-
 static void ieee80211_set_multicast_list(struct net_device *dev)
 {
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-- 
cgit v1.2.3


From 9116dd01120e249dc2e84e6edecd7ad7f828680f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:47:23 +0200
Subject: mac80211: clarify scan request

When a scan is requested for non-STA interfaces, we simply fire
off a scan, but for STA interfaces we shouldn't because they
could be in the middle of an association. This clarifies the
corresponding code.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/scan.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 1beefb5ad6f..9f612015012 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -674,23 +674,32 @@ int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 
 int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len)
 {
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_sta *ifsta;
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
 		return ieee80211_sta_start_scan(sdata, ssid, ssid_len);
 
+	/*
+	 * STA has a state machine that might need to defer scanning
+	 * while it's trying to associate/authenticate, therefore we
+	 * queue it up to the state machine in that case.
+	 */
+
 	if (local->sta_sw_scanning || local->sta_hw_scanning) {
 		if (local->scan_sdata == sdata)
 			return 0;
 		return -EBUSY;
 	}
 
+	ifsta = &sdata->u.sta;
+
 	ifsta->scan_ssid_len = ssid_len;
 	if (ssid_len)
 		memcpy(ifsta->scan_ssid, ssid, ssid_len);
 	set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request);
 	queue_work(local->hw.workqueue, &ifsta->work);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From a0fe8b3349bdee27065b57cdceb2ca53c1487866 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 8 Sep 2008 17:58:23 +0200
Subject: mac80211: simplify scan start

ieee80211_sta_start_scan() can very well take a non-NULL
ssid pointer with a zero ssid_len.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f43ca7b88d5..f4cbe5cc56c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3052,10 +3052,7 @@ void ieee80211_sta_work(struct work_struct *work)
 	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
 	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
 	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
-		if (ifsta->scan_ssid_len)
-			ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
-		else
-			ieee80211_sta_start_scan(sdata, NULL, 0);
+		ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
 		return;
 	}
 
-- 
cgit v1.2.3


From b079ada7dd11cf82c3157a51c205c3d88321c704 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 09:32:59 +0200
Subject: mac80211: remove useless 'ibss' parameter

Ever since we refactored beaconing to not be controlled by a
fake queue this parameter to ieee80211_sta_def_wmm_params
has been unused.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f4cbe5cc56c..b1815c1ee6d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -188,8 +188,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 
 /* MLME */
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
-					 struct ieee80211_sta_bss *bss,
-					 int ibss)
+					 struct ieee80211_sta_bss *bss)
 {
 	struct ieee80211_local *local = sdata->local;
 	int i, have_higher_than_11mbit = 0;
@@ -1849,7 +1848,7 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	}
 	ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates;
 
-	ieee80211_sta_def_wmm_params(sdata, bss, 1);
+	ieee80211_sta_def_wmm_params(sdata, bss);
 
 	ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED;
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
@@ -2932,7 +2931,7 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 			ieee80211_sta_set_ssid(sdata, selected->ssid,
 					       selected->ssid_len);
 		ieee80211_sta_set_bssid(sdata, selected->bssid);
-		ieee80211_sta_def_wmm_params(sdata, selected, 0);
+		ieee80211_sta_def_wmm_params(sdata, selected);
 
 		/* Send out direct probe if no probe resp was received or
 		 * the one we have is outdated
-- 
cgit v1.2.3


From 9ac19a9084001695479a6d6dd67443cc5fb1df2f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 10:57:09 +0200
Subject: mac80211: reorder frame code in mlme

This reorders all frame sending functions to be at the top of the
file. When reading the file, I tend to be looking at either the
frame code or the state machine, and having them mixed in the file
is confusing. When all frame sending is at the top the remainder
of the file is more readable, in my opinion.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 884 ++++++++++++++++++++++++++--------------------------
 1 file changed, 441 insertions(+), 443 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b1815c1ee6d..e917e1b7263 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -74,6 +74,27 @@ static u8 *ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
 	return NULL;
 }
 
+static int ieee80211_compatible_rates(struct ieee80211_sta_bss *bss,
+				      struct ieee80211_supported_band *sband,
+				      u64 *rates)
+{
+	int i, j, count;
+	*rates = 0;
+	count = 0;
+	for (i = 0; i < bss->supp_rates_len; i++) {
+		int rate = (bss->supp_rates[i] & 0x7F) * 5;
+
+		for (j = 0; j < sband->n_bitrates; j++)
+			if (sband->bitrates[j].bitrate == rate) {
+				*rates |= BIT(j);
+				count++;
+				break;
+			}
+	}
+
+	return count;
+}
+
 /* frame sending functions */
 void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      int encrypt)
@@ -186,6 +207,364 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
+static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
+				 struct ieee80211_if_sta *ifsta)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u8 *pos, *ies, *ht_add_ie;
+	int i, len, count, rates_len, supp_rates_len;
+	u16 capab;
+	struct ieee80211_sta_bss *bss;
+	int wmm = 0;
+	struct ieee80211_supported_band *sband;
+	u64 rates = 0;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(*mgmt) + 200 + ifsta->extra_ie_len +
+			    ifsta->ssid_len);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
+		       "frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	capab = ifsta->capab;
+
+	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) {
+		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
+			capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
+		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
+			capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
+	}
+
+	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+				   local->hw.conf.channel->center_freq,
+				   ifsta->ssid, ifsta->ssid_len);
+	if (bss) {
+		if (bss->capability & WLAN_CAPABILITY_PRIVACY)
+			capab |= WLAN_CAPABILITY_PRIVACY;
+		if (bss->wmm_used)
+			wmm = 1;
+
+		/* get all rates supported by the device and the AP as
+		 * some APs don't like getting a superset of their rates
+		 * in the association request (e.g. D-Link DAP 1353 in
+		 * b-only mode) */
+		rates_len = ieee80211_compatible_rates(bss, sband, &rates);
+
+		if ((bss->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) &&
+		    (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT))
+			capab |= WLAN_CAPABILITY_SPECTRUM_MGMT;
+
+		ieee80211_rx_bss_put(local, bss);
+	} else {
+		rates = ~0;
+		rates_len = sband->n_bitrates;
+	}
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+
+	if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
+		skb_put(skb, 10);
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_REASSOC_REQ);
+		mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
+		mgmt->u.reassoc_req.listen_interval =
+				cpu_to_le16(local->hw.conf.listen_interval);
+		memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid,
+		       ETH_ALEN);
+	} else {
+		skb_put(skb, 4);
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_ASSOC_REQ);
+		mgmt->u.assoc_req.capab_info = cpu_to_le16(capab);
+		mgmt->u.reassoc_req.listen_interval =
+				cpu_to_le16(local->hw.conf.listen_interval);
+	}
+
+	/* SSID */
+	ies = pos = skb_put(skb, 2 + ifsta->ssid_len);
+	*pos++ = WLAN_EID_SSID;
+	*pos++ = ifsta->ssid_len;
+	memcpy(pos, ifsta->ssid, ifsta->ssid_len);
+
+	/* add all rates which were marked to be used above */
+	supp_rates_len = rates_len;
+	if (supp_rates_len > 8)
+		supp_rates_len = 8;
+
+	len = sband->n_bitrates;
+	pos = skb_put(skb, supp_rates_len + 2);
+	*pos++ = WLAN_EID_SUPP_RATES;
+	*pos++ = supp_rates_len;
+
+	count = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if (BIT(i) & rates) {
+			int rate = sband->bitrates[i].bitrate;
+			*pos++ = (u8) (rate / 5);
+			if (++count == 8)
+				break;
+		}
+	}
+
+	if (rates_len > count) {
+		pos = skb_put(skb, rates_len - count + 2);
+		*pos++ = WLAN_EID_EXT_SUPP_RATES;
+		*pos++ = rates_len - count;
+
+		for (i++; i < sband->n_bitrates; i++) {
+			if (BIT(i) & rates) {
+				int rate = sband->bitrates[i].bitrate;
+				*pos++ = (u8) (rate / 5);
+			}
+		}
+	}
+
+	if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) {
+		/* 1. power capabilities */
+		pos = skb_put(skb, 4);
+		*pos++ = WLAN_EID_PWR_CAPABILITY;
+		*pos++ = 2;
+		*pos++ = 0; /* min tx power */
+		*pos++ = local->hw.conf.channel->max_power; /* max tx power */
+
+		/* 2. supported channels */
+		/* TODO: get this in reg domain format */
+		pos = skb_put(skb, 2 * sband->n_channels + 2);
+		*pos++ = WLAN_EID_SUPPORTED_CHANNELS;
+		*pos++ = 2 * sband->n_channels;
+		for (i = 0; i < sband->n_channels; i++) {
+			*pos++ = ieee80211_frequency_to_channel(
+					sband->channels[i].center_freq);
+			*pos++ = 1; /* one channel in the subband*/
+		}
+	}
+
+	if (ifsta->extra_ie) {
+		pos = skb_put(skb, ifsta->extra_ie_len);
+		memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len);
+	}
+
+	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
+		pos = skb_put(skb, 9);
+		*pos++ = WLAN_EID_VENDOR_SPECIFIC;
+		*pos++ = 7; /* len */
+		*pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
+		*pos++ = 0x50;
+		*pos++ = 0xf2;
+		*pos++ = 2; /* WME */
+		*pos++ = 0; /* WME info */
+		*pos++ = 1; /* WME ver */
+		*pos++ = 0;
+	}
+
+	/* wmm support is a must to HT */
+	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
+	    sband->ht_info.ht_supported &&
+	    (ht_add_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_EXTRA_INFO))) {
+		struct ieee80211_ht_addt_info *ht_add_info =
+			(struct ieee80211_ht_addt_info *)ht_add_ie;
+		u16 cap = sband->ht_info.cap;
+		__le16 tmp;
+		u32 flags = local->hw.conf.channel->flags;
+
+		switch (ht_add_info->ht_param & IEEE80211_HT_IE_CHA_SEC_OFFSET) {
+		case IEEE80211_HT_IE_CHA_SEC_ABOVE:
+			if (flags & IEEE80211_CHAN_NO_FAT_ABOVE) {
+				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH;
+				cap &= ~IEEE80211_HT_CAP_SGI_40;
+			}
+			break;
+		case IEEE80211_HT_IE_CHA_SEC_BELOW:
+			if (flags & IEEE80211_CHAN_NO_FAT_BELOW) {
+				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH;
+				cap &= ~IEEE80211_HT_CAP_SGI_40;
+			}
+			break;
+		}
+
+		tmp = cpu_to_le16(cap);
+		pos = skb_put(skb, sizeof(struct ieee80211_ht_cap)+2);
+		*pos++ = WLAN_EID_HT_CAPABILITY;
+		*pos++ = sizeof(struct ieee80211_ht_cap);
+		memset(pos, 0, sizeof(struct ieee80211_ht_cap));
+		memcpy(pos, &tmp, sizeof(u16));
+		pos += sizeof(u16);
+		/* TODO: needs a define here for << 2 */
+		*pos++ = sband->ht_info.ampdu_factor |
+			 (sband->ht_info.ampdu_density << 2);
+		memcpy(pos, sband->ht_info.supp_mcs_set, 16);
+	}
+
+	kfree(ifsta->assocreq_ies);
+	ifsta->assocreq_ies_len = (skb->data + skb->len) - ies;
+	ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL);
+	if (ifsta->assocreq_ies)
+		memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+
+static void ieee80211_send_deauth(struct ieee80211_sub_if_data *sdata,
+				  struct ieee80211_if_sta *ifsta, u16 reason)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for deauth "
+		       "frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_DEAUTH);
+	skb_put(skb, 2);
+	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+static void ieee80211_send_disassoc(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_if_sta *ifsta, u16 reason)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for disassoc "
+		       "frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_DISASSOC);
+	skb_put(skb, 2);
+	mgmt->u.disassoc.reason_code = cpu_to_le16(reason);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
+					u8 dialog_token, u16 status, u16 policy,
+					u16 buf_size, u16 timeout)
+{
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 capab;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer "
+		       "for addba resp frame\n", sdata->dev->name);
+		return;
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
+	else
+		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
+	mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
+
+	capab = (u16)(policy << 1);	/* bit 1 aggregation policy */
+	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
+	capab |= (u16)(buf_size << 6);	/* bit 15:6 max size of aggregation */
+
+	mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
+	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
+	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_msrment_ie *request_ie,
+					const u8 *da, const u8 *bssid,
+					u8 dialog_token)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *msr_report;
+
+	skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom +
+				sizeof(struct ieee80211_msrment_ie));
+
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer for "
+				"measurement report frame\n", sdata->dev->name);
+		return;
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
+	memset(msr_report, 0, 24);
+	memcpy(msr_report->da, da, ETH_ALEN);
+	memcpy(msr_report->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(msr_report->bssid, bssid, ETH_ALEN);
+	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
+	msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
+	msr_report->u.action.u.measurement.action_code =
+				WLAN_ACTION_SPCT_MSR_RPRT;
+	msr_report->u.action.u.measurement.dialog_token = dialog_token;
+
+	msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
+	msr_report->u.action.u.measurement.length =
+			sizeof(struct ieee80211_msrment_ie);
+
+	memset(&msr_report->u.action.u.measurement.msr_elem, 0,
+		sizeof(struct ieee80211_msrment_ie));
+	msr_report->u.action.u.measurement.msr_elem.token = request_ie->token;
+	msr_report->u.action.u.measurement.msr_elem.mode |=
+			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
+	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
 /* MLME */
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_sta_bss *bss)
@@ -429,379 +808,85 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 		sdata->bss_conf.dtim_period = bss->dtim_period;
 
 		changed |= ieee80211_handle_bss_capability(sdata, bss);
-
-		ieee80211_rx_bss_put(local, bss);
-	}
-
-	if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
-		changed |= BSS_CHANGED_HT;
-		sdata->bss_conf.assoc_ht = 1;
-		sdata->bss_conf.ht_conf = &conf->ht_conf;
-		sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf;
-	}
-
-	ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
-	memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
-	ieee80211_sta_send_associnfo(sdata, ifsta);
-
-	ifsta->last_probe = jiffies;
-	ieee80211_led_assoc(local, 1);
-
-	sdata->bss_conf.assoc = 1;
-	ieee80211_bss_info_change_notify(sdata, changed);
-
-	netif_tx_start_all_queues(sdata->dev);
-	netif_carrier_on(sdata->dev);
-
-	ieee80211_sta_send_apinfo(sdata, ifsta);
-}
-
-static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata,
-				   struct ieee80211_if_sta *ifsta)
-{
-	DECLARE_MAC_BUF(mac);
-
-	ifsta->direct_probe_tries++;
-	if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
-		printk(KERN_DEBUG "%s: direct probe to AP %s timed out\n",
-		       sdata->dev->name, print_mac(mac, ifsta->bssid));
-		ifsta->state = IEEE80211_STA_MLME_DISABLED;
-		return;
-	}
-
-	printk(KERN_DEBUG "%s: direct probe to AP %s try %d\n",
-			sdata->dev->name, print_mac(mac, ifsta->bssid),
-			ifsta->direct_probe_tries);
-
-	ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
-
-	set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request);
-
-	/* Direct probe is sent to broadcast address as some APs
-	 * will not answer to direct packet in unassociated state.
-	 */
-	ieee80211_send_probe_req(sdata, NULL,
-				 ifsta->ssid, ifsta->ssid_len);
-
-	mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
-}
-
-
-static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
-				   struct ieee80211_if_sta *ifsta)
-{
-	DECLARE_MAC_BUF(mac);
-
-	ifsta->auth_tries++;
-	if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
-		printk(KERN_DEBUG "%s: authentication with AP %s"
-		       " timed out\n",
-		       sdata->dev->name, print_mac(mac, ifsta->bssid));
-		ifsta->state = IEEE80211_STA_MLME_DISABLED;
-		return;
-	}
-
-	ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
-	printk(KERN_DEBUG "%s: authenticate with AP %s\n",
-	       sdata->dev->name, print_mac(mac, ifsta->bssid));
-
-	ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0);
-
-	mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
-}
-
-static int ieee80211_compatible_rates(struct ieee80211_sta_bss *bss,
-				      struct ieee80211_supported_band *sband,
-				      u64 *rates)
-{
-	int i, j, count;
-	*rates = 0;
-	count = 0;
-	for (i = 0; i < bss->supp_rates_len; i++) {
-		int rate = (bss->supp_rates[i] & 0x7F) * 5;
-
-		for (j = 0; j < sband->n_bitrates; j++)
-			if (sband->bitrates[j].bitrate == rate) {
-				*rates |= BIT(j);
-				count++;
-				break;
-			}
-	}
-
-	return count;
-}
-
-static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
-				 struct ieee80211_if_sta *ifsta)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-	u8 *pos, *ies, *ht_add_ie;
-	int i, len, count, rates_len, supp_rates_len;
-	u16 capab;
-	struct ieee80211_sta_bss *bss;
-	int wmm = 0;
-	struct ieee80211_supported_band *sband;
-	u64 rates = 0;
-
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
-			    sizeof(*mgmt) + 200 + ifsta->extra_ie_len +
-			    ifsta->ssid_len);
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
-		       "frame\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
-	capab = ifsta->capab;
-
-	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) {
-		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
-			capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
-		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
-			capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
-	}
-
-	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
-				   local->hw.conf.channel->center_freq,
-				   ifsta->ssid, ifsta->ssid_len);
-	if (bss) {
-		if (bss->capability & WLAN_CAPABILITY_PRIVACY)
-			capab |= WLAN_CAPABILITY_PRIVACY;
-		if (bss->wmm_used)
-			wmm = 1;
-
-		/* get all rates supported by the device and the AP as
-		 * some APs don't like getting a superset of their rates
-		 * in the association request (e.g. D-Link DAP 1353 in
-		 * b-only mode) */
-		rates_len = ieee80211_compatible_rates(bss, sband, &rates);
-
-		if ((bss->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) &&
-		    (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT))
-			capab |= WLAN_CAPABILITY_SPECTRUM_MGMT;
-
-		ieee80211_rx_bss_put(local, bss);
-	} else {
-		rates = ~0;
-		rates_len = sband->n_bitrates;
-	}
-
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-
-	if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
-		skb_put(skb, 10);
-		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-						  IEEE80211_STYPE_REASSOC_REQ);
-		mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
-		mgmt->u.reassoc_req.listen_interval =
-				cpu_to_le16(local->hw.conf.listen_interval);
-		memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid,
-		       ETH_ALEN);
-	} else {
-		skb_put(skb, 4);
-		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-						  IEEE80211_STYPE_ASSOC_REQ);
-		mgmt->u.assoc_req.capab_info = cpu_to_le16(capab);
-		mgmt->u.reassoc_req.listen_interval =
-				cpu_to_le16(local->hw.conf.listen_interval);
-	}
-
-	/* SSID */
-	ies = pos = skb_put(skb, 2 + ifsta->ssid_len);
-	*pos++ = WLAN_EID_SSID;
-	*pos++ = ifsta->ssid_len;
-	memcpy(pos, ifsta->ssid, ifsta->ssid_len);
-
-	/* add all rates which were marked to be used above */
-	supp_rates_len = rates_len;
-	if (supp_rates_len > 8)
-		supp_rates_len = 8;
-
-	len = sband->n_bitrates;
-	pos = skb_put(skb, supp_rates_len + 2);
-	*pos++ = WLAN_EID_SUPP_RATES;
-	*pos++ = supp_rates_len;
-
-	count = 0;
-	for (i = 0; i < sband->n_bitrates; i++) {
-		if (BIT(i) & rates) {
-			int rate = sband->bitrates[i].bitrate;
-			*pos++ = (u8) (rate / 5);
-			if (++count == 8)
-				break;
-		}
-	}
-
-	if (rates_len > count) {
-		pos = skb_put(skb, rates_len - count + 2);
-		*pos++ = WLAN_EID_EXT_SUPP_RATES;
-		*pos++ = rates_len - count;
-
-		for (i++; i < sband->n_bitrates; i++) {
-			if (BIT(i) & rates) {
-				int rate = sband->bitrates[i].bitrate;
-				*pos++ = (u8) (rate / 5);
-			}
-		}
-	}
-
-	if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) {
-		/* 1. power capabilities */
-		pos = skb_put(skb, 4);
-		*pos++ = WLAN_EID_PWR_CAPABILITY;
-		*pos++ = 2;
-		*pos++ = 0; /* min tx power */
-		*pos++ = local->hw.conf.channel->max_power; /* max tx power */
-
-		/* 2. supported channels */
-		/* TODO: get this in reg domain format */
-		pos = skb_put(skb, 2 * sband->n_channels + 2);
-		*pos++ = WLAN_EID_SUPPORTED_CHANNELS;
-		*pos++ = 2 * sband->n_channels;
-		for (i = 0; i < sband->n_channels; i++) {
-			*pos++ = ieee80211_frequency_to_channel(
-					sband->channels[i].center_freq);
-			*pos++ = 1; /* one channel in the subband*/
-		}
-	}
-
-	if (ifsta->extra_ie) {
-		pos = skb_put(skb, ifsta->extra_ie_len);
-		memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len);
+
+		ieee80211_rx_bss_put(local, bss);
 	}
 
-	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
-		pos = skb_put(skb, 9);
-		*pos++ = WLAN_EID_VENDOR_SPECIFIC;
-		*pos++ = 7; /* len */
-		*pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
-		*pos++ = 0x50;
-		*pos++ = 0xf2;
-		*pos++ = 2; /* WME */
-		*pos++ = 0; /* WME info */
-		*pos++ = 1; /* WME ver */
-		*pos++ = 0;
+	if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
+		changed |= BSS_CHANGED_HT;
+		sdata->bss_conf.assoc_ht = 1;
+		sdata->bss_conf.ht_conf = &conf->ht_conf;
+		sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf;
 	}
 
-	/* wmm support is a must to HT */
-	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
-	    sband->ht_info.ht_supported &&
-	    (ht_add_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_EXTRA_INFO))) {
-		struct ieee80211_ht_addt_info *ht_add_info =
-			(struct ieee80211_ht_addt_info *)ht_add_ie;
-		u16 cap = sband->ht_info.cap;
-		__le16 tmp;
-		u32 flags = local->hw.conf.channel->flags;
+	ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
+	memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
+	ieee80211_sta_send_associnfo(sdata, ifsta);
 
-		switch (ht_add_info->ht_param & IEEE80211_HT_IE_CHA_SEC_OFFSET) {
-		case IEEE80211_HT_IE_CHA_SEC_ABOVE:
-			if (flags & IEEE80211_CHAN_NO_FAT_ABOVE) {
-				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH;
-				cap &= ~IEEE80211_HT_CAP_SGI_40;
-			}
-			break;
-		case IEEE80211_HT_IE_CHA_SEC_BELOW:
-			if (flags & IEEE80211_CHAN_NO_FAT_BELOW) {
-				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH;
-				cap &= ~IEEE80211_HT_CAP_SGI_40;
-			}
-			break;
-		}
+	ifsta->last_probe = jiffies;
+	ieee80211_led_assoc(local, 1);
 
-		tmp = cpu_to_le16(cap);
-		pos = skb_put(skb, sizeof(struct ieee80211_ht_cap)+2);
-		*pos++ = WLAN_EID_HT_CAPABILITY;
-		*pos++ = sizeof(struct ieee80211_ht_cap);
-		memset(pos, 0, sizeof(struct ieee80211_ht_cap));
-		memcpy(pos, &tmp, sizeof(u16));
-		pos += sizeof(u16);
-		/* TODO: needs a define here for << 2 */
-		*pos++ = sband->ht_info.ampdu_factor |
-			 (sband->ht_info.ampdu_density << 2);
-		memcpy(pos, sband->ht_info.supp_mcs_set, 16);
-	}
+	sdata->bss_conf.assoc = 1;
+	ieee80211_bss_info_change_notify(sdata, changed);
 
-	kfree(ifsta->assocreq_ies);
-	ifsta->assocreq_ies_len = (skb->data + skb->len) - ies;
-	ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL);
-	if (ifsta->assocreq_ies)
-		memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
+	netif_tx_start_all_queues(sdata->dev);
+	netif_carrier_on(sdata->dev);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_sta_send_apinfo(sdata, ifsta);
 }
 
-
-static void ieee80211_send_deauth(struct ieee80211_sub_if_data *sdata,
-				  struct ieee80211_if_sta *ifsta, u16 reason)
+static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
+	DECLARE_MAC_BUF(mac);
 
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for deauth "
-		       "frame\n", sdata->dev->name);
+	ifsta->direct_probe_tries++;
+	if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
+		printk(KERN_DEBUG "%s: direct probe to AP %s timed out\n",
+		       sdata->dev->name, print_mac(mac, ifsta->bssid));
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
 		return;
 	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
 
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_DEAUTH);
-	skb_put(skb, 2);
-	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
+	printk(KERN_DEBUG "%s: direct probe to AP %s try %d\n",
+			sdata->dev->name, print_mac(mac, ifsta->bssid),
+			ifsta->direct_probe_tries);
 
-	ieee80211_sta_tx(sdata, skb, 0);
-}
+	ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
 
-static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
-{
-	if (!sdata || !sdata->default_key ||
-	    sdata->default_key->conf.alg != ALG_WEP)
-		return 0;
-	return 1;
+	set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request);
+
+	/* Direct probe is sent to broadcast address as some APs
+	 * will not answer to direct packet in unassociated state.
+	 */
+	ieee80211_send_probe_req(sdata, NULL,
+				 ifsta->ssid, ifsta->ssid_len);
+
+	mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
 }
 
-static void ieee80211_send_disassoc(struct ieee80211_sub_if_data *sdata,
-				    struct ieee80211_if_sta *ifsta, u16 reason)
+
+static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
+	DECLARE_MAC_BUF(mac);
 
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for disassoc "
-		       "frame\n", sdata->dev->name);
+	ifsta->auth_tries++;
+	if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
+		printk(KERN_DEBUG "%s: authentication with AP %s"
+		       " timed out\n",
+		       sdata->dev->name, print_mac(mac, ifsta->bssid));
+		ifsta->state = IEEE80211_STA_MLME_DISABLED;
 		return;
 	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
 
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_DISASSOC);
-	skb_put(skb, 2);
-	mgmt->u.disassoc.reason_code = cpu_to_le16(reason);
+	ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+	printk(KERN_DEBUG "%s: authenticate with AP %s\n",
+	       sdata->dev->name, print_mac(mac, ifsta->bssid));
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0);
+
+	mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
 }
 
 static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
@@ -864,6 +949,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	sta_info_destroy(sta);
 }
 
+static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
+{
+	if (!sdata || !sdata->default_key ||
+	    sdata->default_key->conf.alg != ALG_WEP)
+		return 0;
+	return 1;
+}
+
 static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_if_sta *ifsta)
 {
@@ -1009,54 +1102,6 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 			    elems.challenge_len + 2, 1);
 }
 
-static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
-					u8 dialog_token, u16 status, u16 policy,
-					u16 buf_size, u16 timeout)
-{
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-	u16 capab;
-
-	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
-
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer "
-		       "for addba resp frame\n", sdata->dev->name);
-		return;
-	}
-
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
-	else
-		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_ACTION);
-
-	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
-	mgmt->u.action.category = WLAN_CATEGORY_BACK;
-	mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
-	mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
-
-	capab = (u16)(policy << 1);	/* bit 1 aggregation policy */
-	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
-	capab |= (u16)(buf_size << 6);	/* bit 15:6 max size of aggregation */
-
-	mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
-	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
-	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-
-	return;
-}
-
 /*
  * After accepting the AddBA Request we activated a timer,
  * resetting it after each frame that arrives from the originator.
@@ -1329,53 +1374,6 @@ static void ieee80211_sta_process_delba(struct ieee80211_sub_if_data *sdata,
 	rcu_read_unlock();
 }
 
-static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
-					struct ieee80211_msrment_ie *request_ie,
-					const u8 *da, const u8 *bssid,
-					u8 dialog_token)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *msr_report;
-
-	skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom +
-				sizeof(struct ieee80211_msrment_ie));
-
-	if (!skb) {
-		printk(KERN_ERR "%s: failed to allocate buffer for "
-				"measurement report frame\n", sdata->dev->name);
-		return;
-	}
-
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
-	memset(msr_report, 0, 24);
-	memcpy(msr_report->da, da, ETH_ALEN);
-	memcpy(msr_report->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(msr_report->bssid, bssid, ETH_ALEN);
-	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-						IEEE80211_STYPE_ACTION);
-
-	skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
-	msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
-	msr_report->u.action.u.measurement.action_code =
-				WLAN_ACTION_SPCT_MSR_RPRT;
-	msr_report->u.action.u.measurement.dialog_token = dialog_token;
-
-	msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
-	msr_report->u.action.u.measurement.length =
-			sizeof(struct ieee80211_msrment_ie);
-
-	memset(&msr_report->u.action.u.measurement.msr_elem, 0,
-		sizeof(struct ieee80211_msrment_ie));
-	msr_report->u.action.u.measurement.msr_elem.token = request_ie->token;
-	msr_report->u.action.u.measurement.msr_elem.mode |=
-			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
-	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
 static void ieee80211_sta_process_measurement_req(struct ieee80211_sub_if_data *sdata,
 						struct ieee80211_mgmt *mgmt,
 						size_t len)
-- 
cgit v1.2.3


From ef422bc0ae934e6a46dfa63f0e27cad83b94234f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 10:58:25 +0200
Subject: mac80211: consolidate deauth/disassoc

deauth and disassoc frames are completely identical so there's
little point in having two functions to send them rather than
one that gets a parameter. This same a bit of code size.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 47 +++++++++++------------------------------------
 1 file changed, 11 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e917e1b7263..62357a20885 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -416,17 +416,18 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 }
 
 
-static void ieee80211_send_deauth(struct ieee80211_sub_if_data *sdata,
-				  struct ieee80211_if_sta *ifsta, u16 reason)
+static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
+					   u16 stype, u16 reason)
 {
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 
 	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
 	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for deauth "
-		       "frame\n", sdata->dev->name);
+		printk(KERN_DEBUG "%s: failed to allocate buffer for "
+		       "deauth/disassoc frame\n", sdata->dev->name);
 		return;
 	}
 	skb_reserve(skb, local->hw.extra_tx_headroom);
@@ -436,42 +437,14 @@ static void ieee80211_send_deauth(struct ieee80211_sub_if_data *sdata,
 	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
 	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_DEAUTH);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
 	skb_put(skb, 2);
+	/* u.deauth.reason_code == u.disassoc.reason_code */
 	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
 
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-static void ieee80211_send_disassoc(struct ieee80211_sub_if_data *sdata,
-				    struct ieee80211_if_sta *ifsta, u16 reason)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for disassoc "
-		       "frame\n", sdata->dev->name);
-		return;
-	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_DISASSOC);
-	skb_put(skb, 2);
-	mgmt->u.disassoc.reason_code = cpu_to_le16(reason);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
 static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
 					u8 dialog_token, u16 status, u16 policy,
 					u16 buf_size, u16 timeout)
@@ -919,9 +892,11 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 
 	if (self_disconnected) {
 		if (deauth)
-			ieee80211_send_deauth(sdata, ifsta, reason);
+			ieee80211_send_deauth_disassoc(sdata,
+				IEEE80211_STYPE_DEAUTH, reason);
 		else
-			ieee80211_send_disassoc(sdata, ifsta, reason);
+			ieee80211_send_deauth_disassoc(sdata,
+				IEEE80211_STYPE_DISASSOC, reason);
 	}
 
 	ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
-- 
cgit v1.2.3


From 3d35f7c6874d83063d19de0cdb4e503ff4471098 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 12:54:11 +0200
Subject: mac80211: split ieee80211_sta_def_wmm_params

Cleans up the code a bit and prepares for the next patch
that will use the function elsewhere.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 48 +++++++++++++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 62357a20885..f754ad273f9 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -539,13 +539,38 @@ static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_da
 }
 
 /* MLME */
+static void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_queue_params qparam;
+	int i;
+
+	if (!local->ops->conf_tx)
+		return;
+
+	memset(&qparam, 0, sizeof(qparam));
+
+	qparam.aifs = 2;
+
+	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+	    !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE))
+		qparam.cw_min = 31;
+	else
+		qparam.cw_min = 15;
+
+	qparam.cw_max = 1023;
+	qparam.txop = 0;
+
+	for (i = 0; i < local_to_hw(local)->queues; i++)
+		local->ops->conf_tx(local_to_hw(local), i, &qparam);
+}
+
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_sta_bss *bss)
 {
 	struct ieee80211_local *local = sdata->local;
 	int i, have_higher_than_11mbit = 0;
 
-
 	/* cf. IEEE 802.11 9.2.12 */
 	for (i = 0; i < bss->supp_rates_len; i++)
 		if ((bss->supp_rates[i] & 0x7f) * 5 > 110)
@@ -557,26 +582,7 @@ static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 	else
 		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
 
-
-	if (local->ops->conf_tx) {
-		struct ieee80211_tx_queue_params qparam;
-
-		memset(&qparam, 0, sizeof(qparam));
-
-		qparam.aifs = 2;
-
-		if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
-		    !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE))
-			qparam.cw_min = 31;
-		else
-			qparam.cw_min = 15;
-
-		qparam.cw_max = 1023;
-		qparam.txop = 0;
-
-		for (i = 0; i < local_to_hw(local)->queues; i++)
-			local->ops->conf_tx(local_to_hw(local), i, &qparam);
-	}
+	ieee80211_set_wmm_default(sdata);
 }
 
 static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
-- 
cgit v1.2.3


From 5825fe100d654fff89aa67a1e202af1f8a7f0ad0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 12:56:01 +0200
Subject: mac80211: initialise queue QoS parameters at hw start

When hardware is started it might be in a confused state with
respect to queue QoS parameters. This patch changes mac80211
to set sane defaults right after the hardware is brought up.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Michael Buesch <mb@bu3sch.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/main.c        |  9 ++++++++-
 net/mac80211/mlme.c        | 26 --------------------------
 net/mac80211/util.c        | 26 ++++++++++++++++++++++++++
 4 files changed, 35 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 792c09ca08e..b10e70715db 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -987,6 +987,7 @@ int ieee80211_frame_duration(struct ieee80211_local *local, size_t len,
 			     int rate, int erp, int short_preamble);
 void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
 				     struct ieee80211_hdr *hdr);
+void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata);
 
 #ifdef CONFIG_MAC80211_NOINLINE
 #define debug_noinline noinline
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index f90254a5948..6a7f4fae18c 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -399,8 +399,15 @@ static int ieee80211_open(struct net_device *dev)
 		atomic_inc(&local->iff_promiscs);
 
 	local->open_count++;
-	if (need_hw_reconfig)
+	if (need_hw_reconfig) {
 		ieee80211_hw_config(local);
+		/*
+		 * set default queue parameters so drivers don't
+		 * need to initialise the hardware if the hardware
+		 * doesn't start up with sane defaults
+		 */
+		ieee80211_set_wmm_default(sdata);
+	}
 
 	/*
 	 * ieee80211_sta_work is disabled while network interface
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f754ad273f9..c22dcd605e4 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -539,32 +539,6 @@ static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_da
 }
 
 /* MLME */
-static void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_tx_queue_params qparam;
-	int i;
-
-	if (!local->ops->conf_tx)
-		return;
-
-	memset(&qparam, 0, sizeof(qparam));
-
-	qparam.aifs = 2;
-
-	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
-	    !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE))
-		qparam.cw_min = 31;
-	else
-		qparam.cw_min = 15;
-
-	qparam.cw_max = 1023;
-	qparam.txop = 0;
-
-	for (i = 0; i < local_to_hw(local)->queues; i++)
-		local->ops->conf_tx(local_to_hw(local), i, &qparam);
-}
-
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_sta_bss *bss)
 {
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index e19c74cada3..55be3ef5c75 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -572,3 +572,29 @@ void ieee802_11_parse_elems(u8 *start, size_t len,
 		pos += elen;
 	}
 }
+
+void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_queue_params qparam;
+	int i;
+
+	if (!local->ops->conf_tx)
+		return;
+
+	memset(&qparam, 0, sizeof(qparam));
+
+	qparam.aifs = 2;
+
+	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+	    !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE))
+		qparam.cw_min = 31;
+	else
+		qparam.cw_min = 15;
+
+	qparam.cw_max = 1023;
+	qparam.txop = 0;
+
+	for (i = 0; i < local_to_hw(local)->queues; i++)
+		local->ops->conf_tx(local_to_hw(local), i, &qparam);
+}
-- 
cgit v1.2.3


From de1ede7ac3bd300f9aa565d0f93f6cf9ba74bb1a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 14:42:50 +0200
Subject: mac80211: make BA session handling independent of STA mode

The aggregation handling isn't dependent on anything related to our
STA-mode implementation, and doesn't need to depend on it for frame
processing. This patch moves the relevant code to ht.c and adds a
hook in rx.c. For now, the relevant action frames are only processed
in STA/IBSS modes, but that's now something we can easily change.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ht.c          | 303 +++++++++++++++++++++++++++++++++++++++-
 net/mac80211/ieee80211_i.h |  39 +++---
 net/mac80211/mlme.c        | 340 ---------------------------------------------
 net/mac80211/rx.c          |  60 ++++++++
 4 files changed, 380 insertions(+), 362 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c72b3fe3ccd..7e93e1079ad 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -66,9 +66,10 @@ int ieee80211_ht_addt_info_ie_to_ht_bss_info(
 	return 0;
 }
 
-void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da,
-				u16 tid, u8 dialog_token, u16 start_seq_num,
-				u16 agg_size, u16 timeout)
+static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
+					 const u8 *da, u16 tid,
+					 u8 dialog_token, u16 start_seq_num,
+					 u16 agg_size, u16 timeout)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
@@ -115,8 +116,55 @@ void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid,
-			  u16 initiator, u16 reason_code)
+static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
+				      u8 dialog_token, u16 status, u16 policy,
+				      u16 buf_size, u16 timeout)
+{
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 capab;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer "
+		       "for addba resp frame\n", sdata->dev->name);
+		return;
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
+	else
+		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
+	mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
+
+	capab = (u16)(policy << 1);	/* bit 1 aggregation policy */
+	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
+	capab |= (u16)(buf_size << 6);	/* bit 15:6 max size of aggregation */
+
+	mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
+	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
+	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
+				 const u8 *da, u16 tid,
+				 u16 initiator, u16 reason_code)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
@@ -263,7 +311,7 @@ void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *r
  * add Block Ack response will arrive from the recipient.
  * If this timer expires sta_addba_resp_timer_expired will be executed.
  */
-void sta_addba_resp_timer_expired(unsigned long data)
+static void sta_addba_resp_timer_expired(unsigned long data)
 {
 	/* not an elegant detour, but there is no choice as the timer passes
 	 * only one argument, and both sta_info and TID are needed, so init
@@ -699,3 +747,246 @@ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_hw *hw,
 	tasklet_schedule(&local->tasklet);
 }
 EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
+
+/*
+ * After accepting the AddBA Request we activated a timer,
+ * resetting it after each frame that arrives from the originator.
+ * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed.
+ */
+static void sta_rx_agg_session_timer_expired(unsigned long data)
+{
+	/* not an elegant detour, but there is no choice as the timer passes
+	 * only one argument, and various sta_info are needed here, so init
+	 * flow in sta_info_create gives the TID as data, while the timer_to_id
+	 * array gives the sta through container_of */
+	u8 *ptid = (u8 *)data;
+	u8 *timer_to_id = ptid - *ptid;
+	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+					 timer_to_tid[0]);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
+#endif
+	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
+					 (u16)*ptid, WLAN_BACK_TIMER,
+					 WLAN_REASON_QSTA_TIMEOUT);
+}
+
+void ieee80211_process_addba_request(struct ieee80211_local *local,
+				     struct sta_info *sta,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len)
+{
+	struct ieee80211_hw *hw = &local->hw;
+	struct ieee80211_conf *conf = &hw->conf;
+	struct tid_ampdu_rx *tid_agg_rx;
+	u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status;
+	u8 dialog_token;
+	int ret = -EOPNOTSUPP;
+	DECLARE_MAC_BUF(mac);
+
+	/* extract session parameters from addba request frame */
+	dialog_token = mgmt->u.action.u.addba_req.dialog_token;
+	timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout);
+	start_seq_num =
+		le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4;
+
+	capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab);
+	ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1;
+	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
+	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+
+	status = WLAN_STATUS_REQUEST_DECLINED;
+
+	/* sanity check for incoming parameters:
+	 * check if configuration can support the BA policy
+	 * and if buffer size does not exceeds max value */
+	if (((ba_policy != 1)
+		&& (!(conf->ht_conf.cap & IEEE80211_HT_CAP_DELAY_BA)))
+		|| (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
+		status = WLAN_STATUS_INVALID_QOS_PARAM;
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_DEBUG "AddBA Req with bad params from "
+				"%s on tid %u. policy %d, buffer size %d\n",
+				print_mac(mac, mgmt->sa), tid, ba_policy,
+				buf_size);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		goto end_no_lock;
+	}
+	/* determine default buffer size */
+	if (buf_size == 0) {
+		struct ieee80211_supported_band *sband;
+
+		sband = local->hw.wiphy->bands[conf->channel->band];
+		buf_size = IEEE80211_MIN_AMPDU_BUF;
+		buf_size = buf_size << sband->ht_info.ampdu_factor;
+	}
+
+
+	/* examine state machine */
+	spin_lock_bh(&sta->lock);
+
+	if (sta->ampdu_mlme.tid_state_rx[tid] != HT_AGG_STATE_IDLE) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_DEBUG "unexpected AddBA Req from "
+				"%s on tid %u\n",
+				print_mac(mac, mgmt->sa), tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		goto end;
+	}
+
+	/* prepare A-MPDU MLME for Rx aggregation */
+	sta->ampdu_mlme.tid_rx[tid] =
+			kmalloc(sizeof(struct tid_ampdu_rx), GFP_ATOMIC);
+	if (!sta->ampdu_mlme.tid_rx[tid]) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_ERR "allocate rx mlme to tid %d failed\n",
+					tid);
+#endif
+		goto end;
+	}
+	/* rx timer */
+	sta->ampdu_mlme.tid_rx[tid]->session_timer.function =
+				sta_rx_agg_session_timer_expired;
+	sta->ampdu_mlme.tid_rx[tid]->session_timer.data =
+				(unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&sta->ampdu_mlme.tid_rx[tid]->session_timer);
+
+	tid_agg_rx = sta->ampdu_mlme.tid_rx[tid];
+
+	/* prepare reordering buffer */
+	tid_agg_rx->reorder_buf =
+		kmalloc(buf_size * sizeof(struct sk_buff *), GFP_ATOMIC);
+	if (!tid_agg_rx->reorder_buf) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_ERR "can not allocate reordering buffer "
+			       "to tid %d\n", tid);
+#endif
+		kfree(sta->ampdu_mlme.tid_rx[tid]);
+		goto end;
+	}
+	memset(tid_agg_rx->reorder_buf, 0,
+		buf_size * sizeof(struct sk_buff *));
+
+	if (local->ops->ampdu_action)
+		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START,
+					       sta->addr, tid, &start_seq_num);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	if (ret) {
+		kfree(tid_agg_rx->reorder_buf);
+		kfree(tid_agg_rx);
+		sta->ampdu_mlme.tid_rx[tid] = NULL;
+		goto end;
+	}
+
+	/* change state and send addba resp */
+	sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_OPERATIONAL;
+	tid_agg_rx->dialog_token = dialog_token;
+	tid_agg_rx->ssn = start_seq_num;
+	tid_agg_rx->head_seq_num = start_seq_num;
+	tid_agg_rx->buf_size = buf_size;
+	tid_agg_rx->timeout = timeout;
+	tid_agg_rx->stored_mpdu_num = 0;
+	status = WLAN_STATUS_SUCCESS;
+end:
+	spin_unlock_bh(&sta->lock);
+
+end_no_lock:
+	ieee80211_send_addba_resp(sta->sdata, sta->addr, tid,
+				  dialog_token, status, 1, buf_size, timeout);
+}
+
+void ieee80211_process_addba_resp(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len)
+{
+	struct ieee80211_hw *hw = &local->hw;
+	u16 capab;
+	u16 tid;
+	u8 *state;
+
+	capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
+	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
+
+	state = &sta->ampdu_mlme.tid_state_tx[tid];
+
+	spin_lock_bh(&sta->lock);
+
+	if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
+		spin_unlock_bh(&sta->lock);
+		return;
+	}
+
+	if (mgmt->u.action.u.addba_resp.dialog_token !=
+		sta->ampdu_mlme.tid_tx[tid]->dialog_token) {
+		spin_unlock_bh(&sta->lock);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		return;
+	}
+
+	del_timer_sync(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "switched off addBA timer for tid %d \n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+	if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
+			== WLAN_STATUS_SUCCESS) {
+		*state |= HT_ADDBA_RECEIVED_MSK;
+		sta->ampdu_mlme.addba_req_num[tid] = 0;
+
+		if (*state == HT_AGG_STATE_OPERATIONAL)
+			ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+
+		spin_unlock_bh(&sta->lock);
+	} else {
+		sta->ampdu_mlme.addba_req_num[tid]++;
+		/* this will allow the state check in stop_BA_session */
+		*state = HT_AGG_STATE_OPERATIONAL;
+		spin_unlock_bh(&sta->lock);
+		ieee80211_stop_tx_ba_session(hw, sta->addr, tid,
+					     WLAN_BACK_INITIATOR);
+	}
+}
+
+void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
+			     struct sta_info *sta,
+			     struct ieee80211_mgmt *mgmt, size_t len)
+{
+	struct ieee80211_local *local = sdata->local;
+	u16 tid, params;
+	u16 initiator;
+	DECLARE_MAC_BUF(mac);
+
+	params = le16_to_cpu(mgmt->u.action.u.delba.params);
+	tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12;
+	initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	if (net_ratelimit())
+		printk(KERN_DEBUG "delba from %s (%s) tid %d reason code %d\n",
+			print_mac(mac, mgmt->sa),
+			initiator ? "initiator" : "recipient", tid,
+			mgmt->u.action.u.delba.reason_code);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	if (initiator == WLAN_BACK_INITIATOR)
+		ieee80211_sta_stop_rx_ba_session(sdata, sta->addr, tid,
+						 WLAN_BACK_INITIATOR, 0);
+	else { /* WLAN_BACK_RECIPIENT */
+		spin_lock_bh(&sta->lock);
+		sta->ampdu_mlme.tid_state_tx[tid] =
+				HT_AGG_STATE_OPERATIONAL;
+		spin_unlock_bh(&sta->lock);
+		ieee80211_stop_tx_ba_session(&local->hw, sta->addr, tid,
+					     WLAN_BACK_RECIPIENT);
+	}
+}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b10e70715db..60ec7ad2764 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -909,22 +909,6 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason);
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u32 changed);
 u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
-int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
-				   struct ieee80211_ht_info *ht_info);
-int ieee80211_ht_addt_info_ie_to_ht_bss_info(
-			struct ieee80211_ht_addt_info *ht_add_info_ie,
-			struct ieee80211_ht_bss_info *bss_info);
-void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da,
-				  u16 tid, u8 dialog_token, u16 start_seq_num,
-				  u16 agg_size, u16 timeout);
-void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid,
-				u16 initiator, u16 reason_code);
-void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn);
-
-void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *da,
-				u16 tid, u16 initiator, u16 reason);
-void sta_addba_resp_timer_expired(unsigned long data);
-void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr);
 u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band);
@@ -977,6 +961,29 @@ int ieee80211_master_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
+/* HT */
+int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
+				   struct ieee80211_ht_info *ht_info);
+int ieee80211_ht_addt_info_ie_to_ht_bss_info(
+			struct ieee80211_ht_addt_info *ht_add_info_ie,
+			struct ieee80211_ht_bss_info *bss_info);
+void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn);
+
+void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *da,
+				u16 tid, u16 initiator, u16 reason);
+void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr);
+void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
+			     struct sta_info *sta,
+			     struct ieee80211_mgmt *mgmt, size_t len);
+void ieee80211_process_addba_resp(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len);
+void ieee80211_process_addba_request(struct ieee80211_local *local,
+				     struct sta_info *sta,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len);
+
 /* utility functions/constants */
 extern void *mac80211_wiphy_privid; /* for wiphy privid */
 extern const unsigned char rfc1042_header[6];
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c22dcd605e4..25f90f7ccf3 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -445,52 +445,6 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
-					u8 dialog_token, u16 status, u16 policy,
-					u16 buf_size, u16 timeout)
-{
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
-	u16 capab;
-
-	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
-
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer "
-		       "for addba resp frame\n", sdata->dev->name);
-		return;
-	}
-
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
-	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, da, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
-		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
-	else
-		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_ACTION);
-
-	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
-	mgmt->u.action.category = WLAN_CATEGORY_BACK;
-	mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
-	mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
-
-	capab = (u16)(policy << 1);	/* bit 1 aggregation policy */
-	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
-	capab |= (u16)(buf_size << 6);	/* bit 15:6 max size of aggregation */
-
-	mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
-	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
-	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
 static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
 					struct ieee80211_msrment_ie *request_ie,
 					const u8 *da, const u8 *bssid,
@@ -1057,278 +1011,6 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 			    elems.challenge_len + 2, 1);
 }
 
-/*
- * After accepting the AddBA Request we activated a timer,
- * resetting it after each frame that arrives from the originator.
- * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed.
- */
-static void sta_rx_agg_session_timer_expired(unsigned long data)
-{
-	/* not an elegant detour, but there is no choice as the timer passes
-	 * only one argument, and various sta_info are needed here, so init
-	 * flow in sta_info_create gives the TID as data, while the timer_to_id
-	 * array gives the sta through container_of */
-	u8 *ptid = (u8 *)data;
-	u8 *timer_to_id = ptid - *ptid;
-	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
-					 timer_to_tid[0]);
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
-#endif
-	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
-					 (u16)*ptid, WLAN_BACK_TIMER,
-					 WLAN_REASON_QSTA_TIMEOUT);
-}
-
-static void ieee80211_sta_process_addba_request(struct ieee80211_local *local,
-						struct ieee80211_mgmt *mgmt,
-						size_t len)
-{
-	struct ieee80211_hw *hw = &local->hw;
-	struct ieee80211_conf *conf = &hw->conf;
-	struct sta_info *sta;
-	struct tid_ampdu_rx *tid_agg_rx;
-	u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status;
-	u8 dialog_token;
-	int ret = -EOPNOTSUPP;
-	DECLARE_MAC_BUF(mac);
-
-	rcu_read_lock();
-
-	sta = sta_info_get(local, mgmt->sa);
-	if (!sta) {
-		rcu_read_unlock();
-		return;
-	}
-
-	/* extract session parameters from addba request frame */
-	dialog_token = mgmt->u.action.u.addba_req.dialog_token;
-	timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout);
-	start_seq_num =
-		le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4;
-
-	capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab);
-	ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1;
-	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
-	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
-
-	status = WLAN_STATUS_REQUEST_DECLINED;
-
-	/* sanity check for incoming parameters:
-	 * check if configuration can support the BA policy
-	 * and if buffer size does not exceeds max value */
-	if (((ba_policy != 1)
-		&& (!(conf->ht_conf.cap & IEEE80211_HT_CAP_DELAY_BA)))
-		|| (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
-		status = WLAN_STATUS_INVALID_QOS_PARAM;
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_DEBUG "AddBA Req with bad params from "
-				"%s on tid %u. policy %d, buffer size %d\n",
-				print_mac(mac, mgmt->sa), tid, ba_policy,
-				buf_size);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		goto end_no_lock;
-	}
-	/* determine default buffer size */
-	if (buf_size == 0) {
-		struct ieee80211_supported_band *sband;
-
-		sband = local->hw.wiphy->bands[conf->channel->band];
-		buf_size = IEEE80211_MIN_AMPDU_BUF;
-		buf_size = buf_size << sband->ht_info.ampdu_factor;
-	}
-
-
-	/* examine state machine */
-	spin_lock_bh(&sta->lock);
-
-	if (sta->ampdu_mlme.tid_state_rx[tid] != HT_AGG_STATE_IDLE) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_DEBUG "unexpected AddBA Req from "
-				"%s on tid %u\n",
-				print_mac(mac, mgmt->sa), tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		goto end;
-	}
-
-	/* prepare A-MPDU MLME for Rx aggregation */
-	sta->ampdu_mlme.tid_rx[tid] =
-			kmalloc(sizeof(struct tid_ampdu_rx), GFP_ATOMIC);
-	if (!sta->ampdu_mlme.tid_rx[tid]) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_ERR "allocate rx mlme to tid %d failed\n",
-					tid);
-#endif
-		goto end;
-	}
-	/* rx timer */
-	sta->ampdu_mlme.tid_rx[tid]->session_timer.function =
-				sta_rx_agg_session_timer_expired;
-	sta->ampdu_mlme.tid_rx[tid]->session_timer.data =
-				(unsigned long)&sta->timer_to_tid[tid];
-	init_timer(&sta->ampdu_mlme.tid_rx[tid]->session_timer);
-
-	tid_agg_rx = sta->ampdu_mlme.tid_rx[tid];
-
-	/* prepare reordering buffer */
-	tid_agg_rx->reorder_buf =
-		kmalloc(buf_size * sizeof(struct sk_buff *), GFP_ATOMIC);
-	if (!tid_agg_rx->reorder_buf) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		if (net_ratelimit())
-			printk(KERN_ERR "can not allocate reordering buffer "
-			       "to tid %d\n", tid);
-#endif
-		kfree(sta->ampdu_mlme.tid_rx[tid]);
-		goto end;
-	}
-	memset(tid_agg_rx->reorder_buf, 0,
-		buf_size * sizeof(struct sk_buff *));
-
-	if (local->ops->ampdu_action)
-		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START,
-					       sta->addr, tid, &start_seq_num);
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-
-	if (ret) {
-		kfree(tid_agg_rx->reorder_buf);
-		kfree(tid_agg_rx);
-		sta->ampdu_mlme.tid_rx[tid] = NULL;
-		goto end;
-	}
-
-	/* change state and send addba resp */
-	sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_OPERATIONAL;
-	tid_agg_rx->dialog_token = dialog_token;
-	tid_agg_rx->ssn = start_seq_num;
-	tid_agg_rx->head_seq_num = start_seq_num;
-	tid_agg_rx->buf_size = buf_size;
-	tid_agg_rx->timeout = timeout;
-	tid_agg_rx->stored_mpdu_num = 0;
-	status = WLAN_STATUS_SUCCESS;
-end:
-	spin_unlock_bh(&sta->lock);
-
-end_no_lock:
-	ieee80211_send_addba_resp(sta->sdata, sta->addr, tid,
-				  dialog_token, status, 1, buf_size, timeout);
-	rcu_read_unlock();
-}
-
-static void ieee80211_sta_process_addba_resp(struct ieee80211_local *local,
-					     struct ieee80211_mgmt *mgmt,
-					     size_t len)
-{
-	struct ieee80211_hw *hw = &local->hw;
-	struct sta_info *sta;
-	u16 capab;
-	u16 tid;
-	u8 *state;
-
-	rcu_read_lock();
-
-	sta = sta_info_get(local, mgmt->sa);
-	if (!sta) {
-		rcu_read_unlock();
-		return;
-	}
-
-	capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
-	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
-
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-
-	spin_lock_bh(&sta->lock);
-
-	if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
-		spin_unlock_bh(&sta->lock);
-		goto addba_resp_exit;
-	}
-
-	if (mgmt->u.action.u.addba_resp.dialog_token !=
-		sta->ampdu_mlme.tid_tx[tid]->dialog_token) {
-		spin_unlock_bh(&sta->lock);
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		goto addba_resp_exit;
-	}
-
-	del_timer_sync(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "switched off addBA timer for tid %d \n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-	if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
-			== WLAN_STATUS_SUCCESS) {
-		*state |= HT_ADDBA_RECEIVED_MSK;
-		sta->ampdu_mlme.addba_req_num[tid] = 0;
-
-		if (*state == HT_AGG_STATE_OPERATIONAL)
-			ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
-
-		spin_unlock_bh(&sta->lock);
-	} else {
-		sta->ampdu_mlme.addba_req_num[tid]++;
-		/* this will allow the state check in stop_BA_session */
-		*state = HT_AGG_STATE_OPERATIONAL;
-		spin_unlock_bh(&sta->lock);
-		ieee80211_stop_tx_ba_session(hw, sta->addr, tid,
-					     WLAN_BACK_INITIATOR);
-	}
-
-addba_resp_exit:
-	rcu_read_unlock();
-}
-
-static void ieee80211_sta_process_delba(struct ieee80211_sub_if_data *sdata,
-			struct ieee80211_mgmt *mgmt, size_t len)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sta_info *sta;
-	u16 tid, params;
-	u16 initiator;
-	DECLARE_MAC_BUF(mac);
-
-	rcu_read_lock();
-
-	sta = sta_info_get(local, mgmt->sa);
-	if (!sta) {
-		rcu_read_unlock();
-		return;
-	}
-
-	params = le16_to_cpu(mgmt->u.action.u.delba.params);
-	tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12;
-	initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11;
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	if (net_ratelimit())
-		printk(KERN_DEBUG "delba from %s (%s) tid %d reason code %d\n",
-			print_mac(mac, mgmt->sa),
-			initiator ? "initiator" : "recipient", tid,
-			mgmt->u.action.u.delba.reason_code);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-
-	if (initiator == WLAN_BACK_INITIATOR)
-		ieee80211_sta_stop_rx_ba_session(sdata, sta->addr, tid,
-						 WLAN_BACK_INITIATOR, 0);
-	else { /* WLAN_BACK_RECIPIENT */
-		spin_lock_bh(&sta->lock);
-		sta->ampdu_mlme.tid_state_tx[tid] =
-				HT_AGG_STATE_OPERATIONAL;
-		spin_unlock_bh(&sta->lock);
-		ieee80211_stop_tx_ba_session(&local->hw, sta->addr, tid,
-					     WLAN_BACK_RECIPIENT);
-	}
-	rcu_read_unlock();
-}
-
 static void ieee80211_sta_process_measurement_req(struct ieee80211_sub_if_data *sdata,
 						struct ieee80211_mgmt *mgmt,
 						size_t len)
@@ -2207,28 +1889,6 @@ static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
 			break;
 		}
 		break;
-	case WLAN_CATEGORY_BACK:
-		switch (mgmt->u.action.u.addba_req.action_code) {
-		case WLAN_ACTION_ADDBA_REQ:
-			if (len < (IEEE80211_MIN_ACTION_SIZE +
-				   sizeof(mgmt->u.action.u.addba_req)))
-				break;
-			ieee80211_sta_process_addba_request(local, mgmt, len);
-			break;
-		case WLAN_ACTION_ADDBA_RESP:
-			if (len < (IEEE80211_MIN_ACTION_SIZE +
-				   sizeof(mgmt->u.action.u.addba_resp)))
-				break;
-			ieee80211_sta_process_addba_resp(local, mgmt, len);
-			break;
-		case WLAN_ACTION_DELBA:
-			if (len < (IEEE80211_MIN_ACTION_SIZE +
-				   sizeof(mgmt->u.action.u.delba)))
-				break;
-			ieee80211_sta_process_delba(sdata, mgmt, len);
-			break;
-		}
-		break;
 	case PLINK_CATEGORY:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
 			mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 7e09b30dd39..71cce0bcc5b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1510,6 +1510,65 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx)
 	return RX_CONTINUE;
 }
 
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+	int len = rx->skb->len;
+
+	if (!ieee80211_is_action(mgmt->frame_control))
+		return RX_CONTINUE;
+
+	if (!rx->sta)
+		return RX_DROP_MONITOR;
+
+	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+		return RX_DROP_MONITOR;
+
+	/* all categories we currently handle have action_code */
+	if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+		return RX_DROP_MONITOR;
+
+	/*
+	 * FIXME: revisit this, I'm sure we should handle most
+	 *	  of these frames in other modes as well!
+	 */
+	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
+	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+		return RX_DROP_MONITOR;
+
+	switch (mgmt->u.action.category) {
+	case WLAN_CATEGORY_BACK:
+		switch (mgmt->u.action.u.addba_req.action_code) {
+		case WLAN_ACTION_ADDBA_REQ:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.addba_req)))
+				return RX_DROP_MONITOR;
+			ieee80211_process_addba_request(local, rx->sta, mgmt, len);
+			break;
+		case WLAN_ACTION_ADDBA_RESP:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.addba_resp)))
+				return RX_DROP_MONITOR;
+			ieee80211_process_addba_resp(local, rx->sta, mgmt, len);
+			break;
+		case WLAN_ACTION_DELBA:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.delba)))
+				return RX_DROP_MONITOR;
+			ieee80211_process_delba(sdata, rx->sta, mgmt, len);
+			break;
+		}
+		rx->sta->rx_packets++;
+		dev_kfree_skb(rx->skb);
+		return RX_QUEUED;
+	}
+
+	return RX_CONTINUE;
+}
+
 static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 {
@@ -1689,6 +1748,7 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
 		CALL_RXH(ieee80211_rx_h_mesh_fwding);
 	CALL_RXH(ieee80211_rx_h_data)
 	CALL_RXH(ieee80211_rx_h_ctrl)
+	CALL_RXH(ieee80211_rx_h_action)
 	CALL_RXH(ieee80211_rx_h_mgmt)
 
 #undef CALL_RXH
-- 
cgit v1.2.3


From 39192c0bcf556c8521dcf0203714e9d48ac0b9f6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 14:49:03 +0200
Subject: mac80211: move spectrum management code out

Like the HT code, this doesn't depend on the STA-mode implementation
and can be handled entirely independently. There's only stub code
for now, but when it gets filled having it in its own file will be
beneficial.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/Makefile      |  1 +
 net/mac80211/ieee80211_i.h |  5 +++
 net/mac80211/mlme.c        | 89 +++-------------------------------------------
 net/mac80211/rx.c          | 22 +++++++++---
 net/mac80211/spectmgmt.c   | 86 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 114 insertions(+), 89 deletions(-)
 create mode 100644 net/mac80211/spectmgmt.c

(limited to 'net')

diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 1a7ac50910c..2dc8f2bff27 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -17,6 +17,7 @@ mac80211-y := \
 	aes_ccm.o \
 	cfg.o \
 	rx.o \
+	spectmgmt.o \
 	tx.o \
 	key.o \
 	util.o \
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 60ec7ad2764..b2ca9e6122c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -984,6 +984,11 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 				     struct ieee80211_mgmt *mgmt,
 				     size_t len);
 
+/* Spectrum management */
+void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_mgmt *mgmt,
+				       size_t len);
+
 /* utility functions/constants */
 extern void *mac80211_wiphy_privid; /* for wiphy privid */
 extern const unsigned char rfc1042_header[6];
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 25f90f7ccf3..f1ee9d22cf4 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -445,53 +445,6 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
 	ieee80211_sta_tx(sdata, skb, 0);
 }
 
-static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
-					struct ieee80211_msrment_ie *request_ie,
-					const u8 *da, const u8 *bssid,
-					u8 dialog_token)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *msr_report;
-
-	skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom +
-				sizeof(struct ieee80211_msrment_ie));
-
-	if (!skb) {
-		printk(KERN_ERR "%s: failed to allocate buffer for "
-				"measurement report frame\n", sdata->dev->name);
-		return;
-	}
-
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-	msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
-	memset(msr_report, 0, 24);
-	memcpy(msr_report->da, da, ETH_ALEN);
-	memcpy(msr_report->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(msr_report->bssid, bssid, ETH_ALEN);
-	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-						IEEE80211_STYPE_ACTION);
-
-	skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
-	msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
-	msr_report->u.action.u.measurement.action_code =
-				WLAN_ACTION_SPCT_MSR_RPRT;
-	msr_report->u.action.u.measurement.dialog_token = dialog_token;
-
-	msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
-	msr_report->u.action.u.measurement.length =
-			sizeof(struct ieee80211_msrment_ie);
-
-	memset(&msr_report->u.action.u.measurement.msr_elem, 0,
-		sizeof(struct ieee80211_msrment_ie));
-	msr_report->u.action.u.measurement.msr_elem.token = request_ie->token;
-	msr_report->u.action.u.measurement.msr_elem.mode |=
-			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
-	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
-
-	ieee80211_sta_tx(sdata, skb, 0);
-}
-
 /* MLME */
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_sta_bss *bss)
@@ -1011,24 +964,6 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 			    elems.challenge_len + 2, 1);
 }
 
-static void ieee80211_sta_process_measurement_req(struct ieee80211_sub_if_data *sdata,
-						struct ieee80211_mgmt *mgmt,
-						size_t len)
-{
-	/*
-	 * Ignoring measurement request is spec violation.
-	 * Mandatory measurements must be reported optional
-	 * measurements might be refused or reported incapable
-	 * For now just refuse
-	 * TODO: Answer basic measurement as unmeasured
-	 */
-	ieee80211_send_refuse_measurement_request(sdata,
-			&mgmt->u.action.u.measurement.msr_elem,
-			mgmt->sa, mgmt->bssid,
-			mgmt->u.action.u.measurement.dialog_token);
-}
-
-
 static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta,
 				   struct ieee80211_mgmt *mgmt,
@@ -1870,32 +1805,16 @@ static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
 				     size_t len,
 				     struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_local *local = sdata->local;
-
-	/* all categories we currently handle have action_code */
-	if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+	/* currently we only handle mesh interface action frames here */
+	if (!ieee80211_vif_is_mesh(&sdata->vif))
 		return;
 
 	switch (mgmt->u.action.category) {
-	case WLAN_CATEGORY_SPECTRUM_MGMT:
-		if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
-			break;
-		switch (mgmt->u.action.u.measurement.action_code) {
-		case WLAN_ACTION_SPCT_MSR_REQ:
-			if (len < (IEEE80211_MIN_ACTION_SIZE +
-				   sizeof(mgmt->u.action.u.measurement)))
-				break;
-			ieee80211_sta_process_measurement_req(sdata, mgmt, len);
-			break;
-		}
-		break;
 	case PLINK_CATEGORY:
-		if (ieee80211_vif_is_mesh(&sdata->vif))
-			mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
+		mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
 		break;
 	case MESH_PATH_SEL_CATEGORY:
-		if (ieee80211_vif_is_mesh(&sdata->vif))
-			mesh_rx_path_sel_frame(sdata, mgmt, len);
+		mesh_rx_path_sel_frame(sdata, mgmt, len);
 		break;
 	}
 }
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 71cce0bcc5b..d00ace78bf8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1561,12 +1561,26 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			ieee80211_process_delba(sdata, rx->sta, mgmt, len);
 			break;
 		}
-		rx->sta->rx_packets++;
-		dev_kfree_skb(rx->skb);
-		return RX_QUEUED;
+		break;
+	case WLAN_CATEGORY_SPECTRUM_MGMT:
+		if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
+			return RX_DROP_MONITOR;
+		switch (mgmt->u.action.u.measurement.action_code) {
+		case WLAN_ACTION_SPCT_MSR_REQ:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.measurement)))
+				return RX_DROP_MONITOR;
+			ieee80211_process_measurement_req(sdata, mgmt, len);
+			break;
+		}
+		break;
+	default:
+		return RX_CONTINUE;
 	}
 
-	return RX_CONTINUE;
+	rx->sta->rx_packets++;
+	dev_kfree_skb(rx->skb);
+	return RX_QUEUED;
 }
 
 static ieee80211_rx_result debug_noinline
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
new file mode 100644
index 00000000000..b7129f4ae0c
--- /dev/null
+++ b/net/mac80211/spectmgmt.c
@@ -0,0 +1,86 @@
+/*
+ * spectrum management
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007  Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2008, Intel Corporation
+ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <net/wireless.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "wme.h"
+
+static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_msrment_ie *request_ie,
+					const u8 *da, const u8 *bssid,
+					u8 dialog_token)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *msr_report;
+
+	skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom +
+				sizeof(struct ieee80211_msrment_ie));
+
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer for "
+				"measurement report frame\n", sdata->dev->name);
+		return;
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
+	memset(msr_report, 0, 24);
+	memcpy(msr_report->da, da, ETH_ALEN);
+	memcpy(msr_report->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(msr_report->bssid, bssid, ETH_ALEN);
+	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
+	msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
+	msr_report->u.action.u.measurement.action_code =
+				WLAN_ACTION_SPCT_MSR_RPRT;
+	msr_report->u.action.u.measurement.dialog_token = dialog_token;
+
+	msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
+	msr_report->u.action.u.measurement.length =
+			sizeof(struct ieee80211_msrment_ie);
+
+	memset(&msr_report->u.action.u.measurement.msr_elem, 0,
+		sizeof(struct ieee80211_msrment_ie));
+	msr_report->u.action.u.measurement.msr_elem.token = request_ie->token;
+	msr_report->u.action.u.measurement.msr_elem.mode |=
+			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
+	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
+
+	ieee80211_sta_tx(sdata, skb, 0);
+}
+
+void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_mgmt *mgmt,
+				       size_t len)
+{
+	/*
+	 * Ignoring measurement request is spec violation.
+	 * Mandatory measurements must be reported optional
+	 * measurements might be refused or reported incapable
+	 * For now just refuse
+	 * TODO: Answer basic measurement as unmeasured
+	 */
+	ieee80211_send_refuse_measurement_request(sdata,
+			&mgmt->u.action.u.measurement.msr_elem,
+			mgmt->sa, mgmt->bssid,
+			mgmt->u.action.u.measurement.dialog_token);
+}
-- 
cgit v1.2.3


From 759ef3eb1eeba8ff7411771e7b9cf6bfd6bb9cfe Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 14:55:09 +0200
Subject: mac80211: make ieee80211_rx_h_mgmt more readable

That function isn't exactly easy to read especially since it
does something in an if branch that continues after the if
because the else returns. Express it in a more readable way.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index d00ace78bf8..d0803797902 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1586,20 +1586,20 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
 
 	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
 		return RX_DROP_MONITOR;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
-	if ((sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	     sdata->vif.type == IEEE80211_IF_TYPE_IBSS ||
-	     sdata->vif.type == IEEE80211_IF_TYPE_MESH_POINT) &&
-	    !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME))
-		ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
-	else
+	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
+	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+	    sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT)
+		return RX_DROP_MONITOR;
+
+	if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
 		return RX_DROP_MONITOR;
 
+	ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
 	return RX_QUEUED;
 }
 
-- 
cgit v1.2.3


From e50db65c0dad109aae77c353305853b31555b228 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 9 Sep 2008 15:07:09 +0200
Subject: mac80211: move frame TX function

The ieee80211_sta_tx function isn't MLME code any more,
it's getting used by a lot of code. Move it to utils and
rename it to ieee80211_tx_skb.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ht.c          |  8 ++++----
 net/mac80211/ieee80211_i.h |  4 ++--
 net/mac80211/mesh_hwmp.c   |  4 ++--
 net/mac80211/mesh_plink.c  |  2 +-
 net/mac80211/mlme.c        | 24 +++++-------------------
 net/mac80211/scan.c        |  2 +-
 net/mac80211/spectmgmt.c   |  2 +-
 net/mac80211/util.c        | 14 ++++++++++++++
 8 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 7e93e1079ad..4dc35c9dabc 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -113,7 +113,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
 	mgmt->u.action.u.addba_req.start_seq_num =
 					cpu_to_le16(start_seq_num << 4);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
@@ -159,7 +159,7 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
 	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
 	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
@@ -202,7 +202,7 @@ static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
 	mgmt->u.action.u.delba.params = cpu_to_le16(params);
 	mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn)
@@ -231,7 +231,7 @@ void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u1
 	bar->control = cpu_to_le16(bar_control);
 	bar->start_seq_num = cpu_to_le16(ssn);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b2ca9e6122c..6f334e4c3d6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -912,8 +912,6 @@ u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
 u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band);
-void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-		int encrypt);
 void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 			      u8 *ssid, size_t ssid_len);
 void ieee802_11_parse_elems(u8 *start, size_t len,
@@ -1000,6 +998,8 @@ int ieee80211_frame_duration(struct ieee80211_local *local, size_t len,
 void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
 				     struct ieee80211_hdr *hdr);
 void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata);
+void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		      int encrypt);
 
 #ifdef CONFIG_MAC80211_NOINLINE
 #define debug_noinline noinline
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 59fd7fe377e..210d6b85240 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -149,7 +149,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 	pos += ETH_ALEN;
 	memcpy(pos, &dst_dsn, 4);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 	return 0;
 }
 
@@ -198,7 +198,7 @@ int mesh_path_error_tx(u8 *dst, __le32 dst_dsn, u8 *ra,
 	pos += ETH_ALEN;
 	memcpy(pos, &dst_dsn, 4);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 	return 0;
 }
 
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 74983cfa729..7356462dee9 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -217,7 +217,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
 		memcpy(pos, &reason, 2);
 	}
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 	return 0;
 }
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f1ee9d22cf4..2c06f6965b7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -96,20 +96,6 @@ static int ieee80211_compatible_rates(struct ieee80211_sta_bss *bss,
 }
 
 /* frame sending functions */
-void ieee80211_sta_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-		      int encrypt)
-{
-	skb->dev = sdata->local->mdev;
-	skb_set_mac_header(skb, 0);
-	skb_set_network_header(skb, 0);
-	skb_set_transport_header(skb, 0);
-
-	skb->iif = sdata->dev->ifindex;
-	skb->do_not_encrypt = !encrypt;
-
-	dev_queue_xmit(skb);
-}
-
 static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 				struct ieee80211_if_sta *ifsta,
 				int transaction, u8 *extra, size_t extra_len,
@@ -144,7 +130,7 @@ static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 	if (extra)
 		memcpy(skb_put(skb, extra_len), extra, extra_len);
 
-	ieee80211_sta_tx(sdata, skb, encrypt);
+	ieee80211_tx_skb(sdata, skb, encrypt);
 }
 
 void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
@@ -204,7 +190,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 		*pos = rate->bitrate / 5;
 	}
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
@@ -412,7 +398,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	if (ifsta->assocreq_ies)
 		memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 
@@ -442,7 +428,7 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
 	/* u.deauth.reason_code == u.disassoc.reason_code */
 	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 /* MLME */
@@ -1796,7 +1782,7 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 	printk(KERN_DEBUG "%s: Sending ProbeResp to %s\n",
 	       sdata->dev->name, print_mac(mac, resp->da));
 #endif /* CONFIG_MAC80211_IBSS_DEBUG */
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 9f612015012..010781b806f 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -421,7 +421,7 @@ static void ieee80211_send_nullfunc(struct ieee80211_local *local,
 	memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
 	memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index b7129f4ae0c..f72bad636d8 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -65,7 +65,7 @@ static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_da
 			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
 	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
 
-	ieee80211_sta_tx(sdata, skb, 0);
+	ieee80211_tx_skb(sdata, skb, 0);
 }
 
 void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 55be3ef5c75..c3a22ab2ad2 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -598,3 +598,17 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
 	for (i = 0; i < local_to_hw(local)->queues; i++)
 		local->ops->conf_tx(local_to_hw(local), i, &qparam);
 }
+
+void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		      int encrypt)
+{
+	skb->dev = sdata->local->mdev;
+	skb_set_mac_header(skb, 0);
+	skb_set_network_header(skb, 0);
+	skb_set_transport_header(skb, 0);
+
+	skb->iif = sdata->dev->ifindex;
+	skb->do_not_encrypt = !encrypt;
+
+	dev_queue_xmit(skb);
+}
-- 
cgit v1.2.3


From aee14ceb5230afb5c17a4e28222ab9734ffd5002 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Tue, 9 Sep 2008 16:33:15 +0200
Subject: mac80211: Reorder debugfs calls during netdev deinit

ieee80211_free_keys() must be called before
ieee80211_debugfs_remove_netdev() in order to make sure that the
possible default_key symlink is removed before attempting to
remove the netdev debugfs directory.

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/iface.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 4a623b8e91f..672cec60a2f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -31,11 +31,11 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 	int flushed;
 	int i;
 
-	ieee80211_debugfs_remove_netdev(sdata);
-
 	/* free extra data */
 	ieee80211_free_keys(sdata);
 
+	ieee80211_debugfs_remove_netdev(sdata);
+
 	for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++)
 		__skb_queue_purge(&sdata->fragments[i].skb_list);
 	sdata->fragment_next = 0;
-- 
cgit v1.2.3


From 1ae4be22f64326a6784acd7083b9590c9f4215a2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 11 Sep 2008 20:17:05 -0700
Subject: vlan: vlan device not reading gso max size of parent.

The vlan devices are not reading the gso max size of the parent device.  As
a result devices that do not support 64K max gso size are currently
failing.

This issue is seen on 2.6.26 kernels as well and the same patch should be
able to be applied without any issues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c     | 1 +
 net/8021q/vlan_dev.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index b661f47bf10..f0e335aa20d 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -394,6 +394,7 @@ static void vlan_transfer_features(struct net_device *dev,
 
 	vlandev->features &= ~dev->vlan_features;
 	vlandev->features |= dev->features & dev->vlan_features;
+	vlandev->gso_max_size = dev->gso_max_size;
 
 	if (old_features != vlandev->features)
 		netdev_features_change(vlandev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4bf014e51f8..97688cdb550 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -607,6 +607,7 @@ static int vlan_dev_init(struct net_device *dev)
 		      (1<<__LINK_STATE_PRESENT);
 
 	dev->features |= real_dev->features & real_dev->vlan_features;
+	dev->gso_max_size = real_dev->gso_max_size;
 
 	/* ipv6 shared card related stuff */
 	dev->dev_id = real_dev->dev_id;
-- 
cgit v1.2.3


From f262b59becc3f557da6460232abac13706402849 Mon Sep 17 00:00:00 2001
From: Benjamin Thery <benjamin.thery@bull.net>
Date: Fri, 12 Sep 2008 16:16:37 -0700
Subject: net: fix scheduling of dst_gc_task by __dst_free

The dst garbage collector dst_gc_task() may not be scheduled as we
expect it to be in __dst_free().

Indeed, when the dst_gc_timer was replaced by the delayed_work
dst_gc_work, the mod_timer() call used to schedule the garbage
collector at an earlier date was replaced by a schedule_delayed_work()
(see commit 86bba269d08f0c545ae76c90b56727f65d62d57f).

But, the behaviour of mod_timer() and schedule_delayed_work() is
different in the way they handle the delay.

mod_timer() stops the timer and re-arm it with the new given delay,
whereas schedule_delayed_work() only check if the work is already
queued in the workqueue (and queue it (with delay) if it is not)
BUT it does NOT take into account the new delay (even if the new delay
is earlier in time).
schedule_delayed_work() returns 0 if it didn't queue the work,
but we don't check the return code in __dst_free().

If I understand the code in __dst_free() correctly, we want dst_gc_task
to be queued after DST_GC_INC jiffies if we pass the test (and not in
some undetermined time in the future), so I think we should add a call
to cancel_delayed_work() before schedule_delayed_work(). Patch below.

Or we should at least test the return code of schedule_delayed_work(),
and reset the values of dst_garbage.timer_inc and dst_garbage.timer_expires
back to their former values if schedule_delayed_work() failed.
Otherwise the subsequent calls to __dst_free will test the wrong values
and assume wrong thing about when the garbage collector is supposed to
be scheduled.

dst_gc_task() also calls schedule_delayed_work() without checking
its return code (or calling cancel_scheduled_work() first), but it
should fine there: dst_gc_task is the routine of the delayed_work, so
no dst_gc_work should be pending in the queue when it's running.

Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/dst.c b/net/core/dst.c
index fe03266130b..09c1530f468 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -203,6 +203,7 @@ void __dst_free(struct dst_entry * dst)
 	if (dst_garbage.timer_inc > DST_GC_INC) {
 		dst_garbage.timer_inc = DST_GC_INC;
 		dst_garbage.timer_expires = DST_GC_MIN;
+		cancel_delayed_work(&dst_gc_work);
 		schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
 	}
 	spin_unlock_bh(&dst_garbage.lock);
-- 
cgit v1.2.3


From 78d15e82754945ee9821fb491b57faf43abfb9d7 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 12 Sep 2008 16:17:43 -0700
Subject: tcp_ipv6: fix use of uninitialized memory

inet6_rsk() is called on a struct request_sock * before we
have checked whether the socket is an ipv6 socket or a ipv6-
mapped ipv4 socket. The access that triggers this is the
inet_rsk(rsk)->inet6_rsk_offset dereference in inet6_rsk().

This is arguably not a critical error as the inet6_rsk_offset
is only used to compute a pointer which is never really used
(in the code path in question) anyway. But it might be a
latent error, so let's fix it.

Spotted by kmemcheck.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b585c850a89..e85f377a8f8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1286,7 +1286,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 					  struct request_sock *req,
 					  struct dst_entry *dst)
 {
-	struct inet6_request_sock *treq = inet6_rsk(req);
+	struct inet6_request_sock *treq;
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct tcp6_sock *newtcp6sk;
 	struct inet_sock *newinet;
@@ -1350,6 +1350,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		return newsk;
 	}
 
+	treq = inet6_rsk(req);
 	opt = np->opt;
 
 	if (sk_acceptq_is_full(sk))
-- 
cgit v1.2.3


From 92651940ab00dbe64722e908f70d816713d677b7 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 12 Sep 2008 16:29:34 -0700
Subject: pkt_sched: Add multiqueue scheduler support

This patch is intended to add a qdisc to support the new tx multiqueue
architecture by providing a band for each hardware queue.  By doing
this it is possible to support a different qdisc per physical hardware
queue.

This qdisc uses the skb->queue_mapping to select which band to place
the traffic onto.  It then uses a round robin w/ a check to see if the
subqueue is stopped to determine which band to dequeue the packet from.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig      |   9 +
 net/sched/Makefile     |   1 +
 net/sched/sch_multiq.c | 467 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 477 insertions(+)
 create mode 100644 net/sched/sch_multiq.c

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9437b27ff84..efaa7a75e7f 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -106,6 +106,15 @@ config NET_SCH_PRIO
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_prio.
 
+config NET_SCH_MULTIQ
+	tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
+	---help---
+	  Say Y here if you want to use an n-band queue packet scheduler
+	  to support devices that have multiple hardware transmit queues.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_multiq.
+
 config NET_SCH_RED
 	tristate "Random Early Detection (RED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 1d2b0f7df84..3d9b953f7f6 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
 obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
+obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
new file mode 100644
index 00000000000..49a8b67ed3b
--- /dev/null
+++ b/net/sched/sch_multiq.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct multiq_sched_data {
+	u16 bands;
+	u16 max_bands;
+	u16 curband;
+	struct tcf_proto *filter_list;
+	struct Qdisc **queues;
+};
+
+
+static struct Qdisc *
+multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	u32 band;
+	struct tcf_result res;
+	int err;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	err = tc_classify(skb, q->filter_list, &res);
+#ifdef CONFIG_NET_CLS_ACT
+	switch (err) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+	case TC_ACT_SHOT:
+		return NULL;
+	}
+#endif
+	band = skb_get_queue_mapping(skb);
+
+	if (band >= q->bands)
+		return q->queues[0];
+
+	return q->queues[band];
+}
+
+static int
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc_enqueue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->bstats.bytes += qdisc_pkt_len(skb);
+		sch->bstats.packets++;
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+
+static int
+multiq_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc->ops->requeue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		sch->qstats.requeues++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+
+static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	struct sk_buff *skb;
+	int band;
+
+	for (band = 0; band < q->bands; band++) {
+		/* cycle through bands to ensure fairness */
+		q->curband++;
+		if (q->curband >= q->bands)
+			q->curband = 0;
+
+		/* Check that target subqueue is available before
+		 * pulling an skb to avoid excessive requeues
+		 */
+		if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+			qdisc = q->queues[q->curband];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				sch->q.qlen--;
+				return skb;
+			}
+		}
+	}
+	return NULL;
+
+}
+
+static unsigned int multiq_drop(struct Qdisc *sch)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+	unsigned int len;
+	struct Qdisc *qdisc;
+
+	for (band = q->bands-1; band >= 0; band--) {
+		qdisc = q->queues[band];
+		if (qdisc->ops->drop) {
+			len = qdisc->ops->drop(qdisc);
+			if (len != 0) {
+				sch->q.qlen--;
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
+
+static void
+multiq_reset(struct Qdisc *sch)
+{
+	u16 band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	for (band = 0; band < q->bands; band++)
+		qdisc_reset(q->queues[band]);
+	sch->q.qlen = 0;
+	q->curband = 0;
+}
+
+static void
+multiq_destroy(struct Qdisc *sch)
+{
+	int band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	for (band = 0; band < q->bands; band++)
+		qdisc_destroy(q->queues[band]);
+
+	kfree(q->queues);
+}
+
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct tc_multiq_qopt *qopt;
+	int i;
+
+	if (!netif_is_multiqueue(qdisc_dev(sch)))
+		return -EINVAL;
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+
+	qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+
+	sch_tree_lock(sch);
+	q->bands = qopt->bands;
+	for (i = q->bands; i < q->max_bands; i++) {
+		struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
+		if (child != &noop_qdisc) {
+			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_destroy(child);
+		}
+	}
+
+	sch_tree_unlock(sch);
+
+	for (i = 0; i < q->bands; i++) {
+		if (q->queues[i] == &noop_qdisc) {
+			struct Qdisc *child;
+			child = qdisc_create_dflt(qdisc_dev(sch),
+						  sch->dev_queue,
+						  &pfifo_qdisc_ops,
+						  TC_H_MAKE(sch->handle,
+							    i + 1));
+			if (child) {
+				sch_tree_lock(sch);
+				child = xchg(&q->queues[i], child);
+
+				if (child != &noop_qdisc) {
+					qdisc_tree_decrease_qlen(child,
+								 child->q.qlen);
+					qdisc_destroy(child);
+				}
+				sch_tree_unlock(sch);
+			}
+		}
+	}
+	return 0;
+}
+
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int i;
+
+	q->queues = NULL;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	q->max_bands = qdisc_dev(sch)->num_tx_queues;
+
+	q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
+	if (!q->queues)
+		return -ENOBUFS;
+	for (i = 0; i < q->max_bands; i++)
+		q->queues[i] = &noop_qdisc;
+
+	return multiq_tune(sch, opt);
+}
+
+static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_multiq_qopt opt;
+
+	opt.bands = q->bands;
+	opt.max_bands = q->max_bands;
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		      struct Qdisc **old)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	if (band >= q->bands)
+		return -EINVAL;
+
+	if (new == NULL)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->queues[band];
+	q->queues[band] = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *
+multiq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	if (band >= q->bands)
+		return NULL;
+
+	return q->queues[band];
+}
+
+static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = TC_H_MIN(classid);
+
+	if (band - 1 >= q->bands)
+		return 0;
+	return band;
+}
+
+static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
+				 u32 classid)
+{
+	return multiq_get(sch, classid);
+}
+
+
+static void multiq_put(struct Qdisc *q, unsigned long cl)
+{
+	return;
+}
+
+static int multiq_change(struct Qdisc *sch, u32 handle, u32 parent,
+			 struct nlattr **tca, unsigned long *arg)
+{
+	unsigned long cl = *arg;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+static int multiq_delete(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+
+static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	if (q->queues[cl-1])
+		tcm->tcm_info = q->queues[cl-1]->handle;
+	return 0;
+}
+
+static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				 struct gnet_dump *d)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *cl_q;
+
+	cl_q = q->queues[cl - 1];
+	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+
+	if (arg->stop)
+		return;
+
+	for (band = 0; band < q->bands; band++) {
+		if (arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, band+1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static const struct Qdisc_class_ops multiq_class_ops = {
+	.graft		=	multiq_graft,
+	.leaf		=	multiq_leaf,
+	.get		=	multiq_get,
+	.put		=	multiq_put,
+	.change		=	multiq_change,
+	.delete		=	multiq_delete,
+	.walk		=	multiq_walk,
+	.tcf_chain	=	multiq_find_tcf,
+	.bind_tcf	=	multiq_bind,
+	.unbind_tcf	=	multiq_put,
+	.dump		=	multiq_dump_class,
+	.dump_stats	=	multiq_dump_class_stats,
+};
+
+static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&multiq_class_ops,
+	.id		=	"multiq",
+	.priv_size	=	sizeof(struct multiq_sched_data),
+	.enqueue	=	multiq_enqueue,
+	.dequeue	=	multiq_dequeue,
+	.requeue	=	multiq_requeue,
+	.drop		=	multiq_drop,
+	.init		=	multiq_init,
+	.reset		=	multiq_reset,
+	.destroy	=	multiq_destroy,
+	.change		=	multiq_tune,
+	.dump		=	multiq_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init multiq_module_init(void)
+{
+	return register_qdisc(&multiq_qdisc_ops);
+}
+
+static void __exit multiq_module_exit(void)
+{
+	unregister_qdisc(&multiq_qdisc_ops);
+}
+
+module_init(multiq_module_init)
+module_exit(multiq_module_exit)
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From ca9b0e27e072be4cef2f5f0cbc0b0fd94eae3520 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 12 Sep 2008 16:30:20 -0700
Subject: pkt_action: add new action skbedit

This new action will have the ability to change the priority and/or
queue_mapping fields on an sk_buff.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig       |  11 +++
 net/sched/Makefile      |   1 +
 net/sched/act_skbedit.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 215 insertions(+)
 create mode 100644 net/sched/act_skbedit.c

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index efaa7a75e7f..6767e54155d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -485,6 +485,17 @@ config NET_ACT_SIMP
 	  To compile this code as a module, choose M here: the
 	  module will be called simple.
 
+config NET_ACT_SKBEDIT
+        tristate "SKB Editing"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to change skb priority or queue_mapping settings.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called skbedit.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 3d9b953f7f6..e60c9925b26 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_NET_ACT_IPT)	+= act_ipt.o
 obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
 obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
 obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
+obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
new file mode 100644
index 00000000000..fe9777e77f3
--- /dev/null
+++ b/net/sched/act_skbedit.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_skbedit.h>
+
+#define SKBEDIT_TAB_MASK     15
+static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1];
+static u32 skbedit_idx_gen;
+static DEFINE_RWLOCK(skbedit_lock);
+
+static struct tcf_hashinfo skbedit_hash_info = {
+	.htab	=	tcf_skbedit_ht,
+	.hmask	=	SKBEDIT_TAB_MASK,
+	.lock	=	&skbedit_lock,
+};
+
+static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_skbedit *d = a->priv;
+
+	spin_lock(&d->tcf_lock);
+	d->tcf_tm.lastuse = jiffies;
+	d->tcf_bstats.bytes += qdisc_pkt_len(skb);
+	d->tcf_bstats.packets++;
+
+	if (d->flags & SKBEDIT_F_PRIORITY)
+		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
+	    skb->dev->real_num_tx_queues > d->queue_mapping)
+		skb_set_queue_mapping(skb, d->queue_mapping);
+
+	spin_unlock(&d->tcf_lock);
+	return d->tcf_action;
+}
+
+static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
+	[TCA_SKBEDIT_PARMS]		= { .len = sizeof(struct tc_skbedit) },
+	[TCA_SKBEDIT_PRIORITY]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_QUEUE_MAPPING]	= { .len = sizeof(u16) },
+};
+
+static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+	struct tc_skbedit *parm;
+	struct tcf_skbedit *d;
+	struct tcf_common *pc;
+	u32 flags = 0, *priority = NULL;
+	u16 *queue_mapping = NULL;
+	int ret = 0, err;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_SKBEDIT_PARMS] == NULL)
+		return -EINVAL;
+
+	if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
+		flags |= SKBEDIT_F_PRIORITY;
+		priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
+	}
+
+	if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+		flags |= SKBEDIT_F_QUEUE_MAPPING;
+		queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
+	}
+	if (!flags)
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
+
+	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
+				     &skbedit_idx_gen, &skbedit_hash_info);
+		if (unlikely(!pc))
+			return -ENOMEM;
+
+		d = to_skbedit(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		d = to_skbedit(pc);
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &skbedit_hash_info);
+			return -EEXIST;
+		}
+	}
+
+	spin_lock_bh(&d->tcf_lock);
+
+	d->flags = flags;
+	if (flags & SKBEDIT_F_PRIORITY)
+		d->priority = *priority;
+	if (flags & SKBEDIT_F_QUEUE_MAPPING)
+		d->queue_mapping = *queue_mapping;
+	d->tcf_action = parm->action;
+
+	spin_unlock_bh(&d->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &skbedit_hash_info);
+	return ret;
+}
+
+static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_skbedit *d = a->priv;
+
+	if (d)
+		return tcf_hash_release(&d->common, bind, &skbedit_hash_info);
+	return 0;
+}
+
+static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+				int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_skbedit *d = a->priv;
+	struct tc_skbedit opt;
+	struct tcf_t t;
+
+	opt.index = d->tcf_index;
+	opt.refcnt = d->tcf_refcnt - ref;
+	opt.bindcnt = d->tcf_bindcnt - bind;
+	opt.action = d->tcf_action;
+	NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
+	if (d->flags & SKBEDIT_F_PRIORITY)
+		NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
+			&d->priority);
+	if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
+		NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
+			sizeof(d->queue_mapping), &d->queue_mapping);
+	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+	NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tc_action_ops act_skbedit_ops = {
+	.kind		=	"skbedit",
+	.hinfo		=	&skbedit_hash_info,
+	.type		=	TCA_ACT_SKBEDIT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_skbedit,
+	.dump		=	tcf_skbedit_dump,
+	.cleanup	=	tcf_skbedit_cleanup,
+	.init		=	tcf_skbedit_init,
+	.walk		=	tcf_generic_walker,
+};
+
+MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
+MODULE_DESCRIPTION("SKB Editing");
+MODULE_LICENSE("GPL");
+
+static int __init skbedit_init_module(void)
+{
+	return tcf_register_action(&act_skbedit_ops);
+}
+
+static void __exit skbedit_cleanup_module(void)
+{
+	tcf_unregister_action(&act_skbedit_ops);
+}
+
+module_init(skbedit_init_module);
+module_exit(skbedit_cleanup_module);
-- 
cgit v1.2.3


From f07d1501292b3b0d3276ee0e537005526a45e242 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 12 Sep 2008 17:57:23 -0700
Subject: multiq: Further multiqueue cleanup

This patch resolves a few issues found with multiq including wording
suggestions and a problem seen in the allocation of queues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_multiq.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 49a8b67ed3b..5d9cd68e91d 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -214,8 +214,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 	sch_tree_lock(sch);
 	q->bands = qopt->bands;
 	for (i = q->bands; i < q->max_bands; i++) {
-		struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
-		if (child != &noop_qdisc) {
+		if (q->queues[i] != &noop_qdisc) {
+			struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
 			qdisc_tree_decrease_qlen(child, child->q.qlen);
 			qdisc_destroy(child);
 		}
@@ -250,7 +250,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
-	int i;
+	int i, err;
 
 	q->queues = NULL;
 
@@ -265,7 +265,12 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
 	for (i = 0; i < q->max_bands; i++)
 		q->queues[i] = &noop_qdisc;
 
-	return multiq_tune(sch, opt);
+	err = multiq_tune(sch,opt);
+
+	if (err)
+		kfree(q->queues);
+
+	return err;
 }
 
 static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
cgit v1.2.3


From 63f2c0464875b6ef2132cecb19b2a5abbf061227 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 12 Sep 2008 23:23:50 -0700
Subject: net: ip_vs_proto_{tcp,udp} build fix

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 1 +
 net/ipv4/ipvs/ip_vs_proto_udp.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 537f616776d..dd4566ea2bf 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -18,6 +18,7 @@
 #include <linux/tcp.h>                  /* for tcphdr */
 #include <net/ip.h>
 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index e3ee26bd1de..6eb6039d634 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -22,6 +22,7 @@
 
 #include <net/ip_vs.h>
 #include <net/ip.h>
+#include <net/ip6_checksum.h>
 
 static struct ip_vs_conn *
 udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-- 
cgit v1.2.3


From b2e1b30290539b344cbaff0d9da38012e03aa347 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <lrodriguez@atheros.com>
Date: Tue, 9 Sep 2008 23:19:48 -0700
Subject: cfg80211: Add new wireless regulatory infrastructure

This adds the new wireless regulatory infrastructure. The
main motiviation behind this was to centralize regulatory
code as each driver was implementing their own regulatory solution,
and to replace the initial centralized code we have where:

* only 3 regulatory domains are supported: US, JP and EU
* regulatory domains can only be changed through module parameter
* all rules were built statically in the kernel

We now have support for regulatory domains for many countries
and regulatory domains are now queried through a userspace agent
through udev allowing distributions to update regulatory rules
without updating the kernel.

Each driver can regulatory_hint() a regulatory domain
based on either their EEPROM mapped regulatory domain value to a
respective ISO/IEC 3166-1 country code or pass an internally built
regulatory domain. We also add support to let the user set the
regulatory domain through userspace in case of faulty EEPROMs to
further help compliance.

Support for world roaming will be added soon for cards capable of
this.

For more information see:

http://wireless.kernel.org/en/developers/Regulatory/CRDA

For now we leave an option to enable the old module parameter,
ieee80211_regdom, and to build the 3 old regdomains statically
(US, JP and EU). This option is CONFIG_WIRELESS_OLD_REGULATORY.
These old static definitions and the module parameter is being
scheduled for removal for 2.6.29. Note that if you use this
you won't make use of a world regulatory domain as its pointless.
If you leave this option enabled and if CRDA is present and you
use US or JP we will try to ask CRDA to update us a regulatory
domain for us.

Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c     |   7 +
 net/wireless/Kconfig   |  32 ++
 net/wireless/core.c    | 162 +++++++++-
 net/wireless/core.h    |   2 +-
 net/wireless/nl80211.c | 151 ++++++++++
 net/wireless/reg.c     | 805 ++++++++++++++++++++++++++++++++++++++++++-------
 net/wireless/reg.h     |  44 +++
 7 files changed, 1087 insertions(+), 116 deletions(-)
 create mode 100644 net/wireless/reg.h

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 928813ce08e..5a3bdaad6c1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -17,6 +17,13 @@
 #include "rate.h"
 #include "mesh.h"
 
+struct ieee80211_hw *wiphy_to_hw(struct wiphy *wiphy)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	return &local->hw;
+}
+EXPORT_SYMBOL(wiphy_to_hw);
+
 static enum ieee80211_if_types
 nl80211_type_to_mac80211_type(enum nl80211_iftype type)
 {
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 833b024f8f6..b97bd9fe6b7 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -14,6 +14,38 @@ config NL80211
 
 	  If unsure, say Y.
 
+config WIRELESS_OLD_REGULATORY
+	bool "Old wireless static regulatory defintions"
+	default n
+	---help---
+	  This option enables the old static regulatory information
+	  and uses it within the new framework. This is available
+	  temporarily as an option to help prevent immediate issues
+	  due to the switch to the new regulatory framework which
+	  does require a new userspace application which has the
+	  database of regulatory information (CRDA) and another for
+	  setting regulatory domains (iw).
+
+	  For more information see:
+
+	  http://wireless.kernel.org/en/developers/Regulatory/CRDA
+	  http://wireless.kernel.org/en/users/Documentation/iw
+
+	  It is important to note though that if you *do* have CRDA present
+	  and if this option is enabled CRDA *will* be called to update the
+	  regulatory domain (for US and JP only). Support for letting the user
+	  set the regulatory domain through iw is also supported. This option
+	  mainly exists to leave around for a kernel release some old static
+	  regulatory domains that were defined and to keep around the old
+	  ieee80211_regdom module parameter. This is being phased out and you
+	  should stop using them ASAP.
+
+	  Say N unless you cannot install a new userspace application
+	  or have one currently depending on the ieee80211_regdom module
+	  parameter and cannot port it to use the new userspace interfaces.
+
+	  This is scheduled for removal for 2.6.29.
+
 config WIRELESS_EXT
 	bool "Wireless extensions"
 	default n
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 7e995ac06a0..a910cd2d0fd 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -13,12 +13,14 @@
 #include <linux/debugfs.h>
 #include <linux/notifier.h>
 #include <linux/device.h>
+#include <linux/list.h>
 #include <net/genetlink.h>
 #include <net/cfg80211.h>
 #include <net/wireless.h>
 #include "nl80211.h"
 #include "core.h"
 #include "sysfs.h"
+#include "reg.h"
 
 /* name for sysfs, %d is appended */
 #define PHY_NAME "phy"
@@ -27,6 +29,107 @@ MODULE_AUTHOR("Johannes Berg");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("wireless configuration support");
 
+struct list_head regulatory_requests;
+
+/* Central wireless core regulatory domains, we only need two,
+ * the current one and a world regulatory domain in case we have no
+ * information to give us an alpha2 */
+struct ieee80211_regdomain *cfg80211_regdomain;
+
+/* We keep a static world regulatory domain in case of the absence of CRDA */
+const struct ieee80211_regdomain world_regdom = {
+	.n_reg_rules = 1,
+	.alpha2 =  "00",
+	.reg_rules = {
+		REG_RULE(2402, 2472, 40, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN |
+			NL80211_RRF_NO_IBSS),
+	}
+};
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+/* All this fucking static junk will be removed soon, so
+ * don't fucking count on it !@#$ */
+
+static char *ieee80211_regdom = "US";
+module_param(ieee80211_regdom, charp, 0444);
+MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
+
+/* We assume 40 MHz bandwidth for the old regulatory work.
+ * We make emphasis we are using the exact same frequencies
+ * as before */
+
+const struct ieee80211_regdomain us_regdom = {
+	.n_reg_rules = 6,
+	.alpha2 =  "US",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..11 */
+		REG_RULE(2412-20, 2462+20, 40, 6, 27, 0),
+		/* IEEE 802.11a, channel 36 */
+		REG_RULE(5180-20, 5180+20, 40, 6, 23, 0),
+		/* IEEE 802.11a, channel 40 */
+		REG_RULE(5200-20, 5200+20, 40, 6, 23, 0),
+		/* IEEE 802.11a, channel 44 */
+		REG_RULE(5220-20, 5220+20, 40, 6, 23, 0),
+		/* IEEE 802.11a, channels 48..64 */
+		REG_RULE(5240-20, 5320+20, 40, 6, 23, 0),
+		/* IEEE 802.11a, channels 149..165, outdoor */
+		REG_RULE(5745-20, 5825+20, 40, 6, 30, 0),
+	}
+};
+
+const struct ieee80211_regdomain jp_regdom = {
+	.n_reg_rules = 3,
+	.alpha2 =  "JP",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..14 */
+		REG_RULE(2412-20, 2484+20, 40, 6, 20, 0),
+		/* IEEE 802.11a, channels 34..48 */
+		REG_RULE(5170-20, 5240+20, 40, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channels 52..64 */
+		REG_RULE(5260-20, 5320+20, 40, 6, 20,
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_DFS),
+	}
+};
+
+const struct ieee80211_regdomain eu_regdom = {
+	.n_reg_rules = 6,
+	/* This alpha2 is bogus, we leave it here just for stupid
+	 * backward compatibility */
+	.alpha2 =  "EU",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..13 */
+		REG_RULE(2412-20, 2472+20, 40, 6, 20, 0),
+		/* IEEE 802.11a, channel 36 */
+		REG_RULE(5180-20, 5180+20, 40, 6, 23,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channel 40 */
+		REG_RULE(5200-20, 5200+20, 40, 6, 23,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channel 44 */
+		REG_RULE(5220-20, 5220+20, 40, 6, 23,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channels 48..64 */
+		REG_RULE(5240-20, 5320+20, 40, 6, 20,
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_DFS),
+		/* IEEE 802.11a, channels 100..140 */
+		REG_RULE(5500-20, 5700+20, 40, 6, 30,
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_DFS),
+	}
+};
+
+#endif
+
+struct ieee80211_regdomain *cfg80211_world_regdom =
+	(struct ieee80211_regdomain *) &world_regdom;
+
+LIST_HEAD(regulatory_requests);
+DEFINE_MUTEX(cfg80211_reg_mutex);
+
 /* RCU might be appropriate here since we usually
  * only read the list, and that can happen quite
  * often because we need to do it for each command */
@@ -302,7 +405,9 @@ int wiphy_register(struct wiphy *wiphy)
 	ieee80211_set_bitrate_flags(wiphy);
 
 	/* set up regulatory info */
-	wiphy_update_regulatory(wiphy);
+	mutex_lock(&cfg80211_reg_mutex);
+	wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE);
+	mutex_unlock(&cfg80211_reg_mutex);
 
 	mutex_lock(&cfg80211_drv_mutex);
 
@@ -409,9 +514,35 @@ static struct notifier_block cfg80211_netdev_notifier = {
 	.notifier_call = cfg80211_netdev_notifier_call,
 };
 
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+const struct ieee80211_regdomain *static_regdom(char *alpha2)
+{
+	if (alpha2[0] == 'U' && alpha2[1] == 'S')
+		return &us_regdom;
+	if (alpha2[0] == 'J' && alpha2[1] == 'P')
+		return &jp_regdom;
+	if (alpha2[0] == 'E' && alpha2[1] == 'U')
+		return &eu_regdom;
+	/* Default, as per the old rules */
+	return &us_regdom;
+}
+#endif
+
 static int cfg80211_init(void)
 {
-	int err = wiphy_sysfs_init();
+	int err;
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	cfg80211_regdomain =
+		(struct ieee80211_regdomain *) static_regdom(ieee80211_regdom);
+	/* Used during reset_regdomains_static() */
+	cfg80211_world_regdom = cfg80211_regdomain;
+#else
+	cfg80211_regdomain =
+		(struct ieee80211_regdomain *) cfg80211_world_regdom;
+#endif
+
+	err = wiphy_sysfs_init();
 	if (err)
 		goto out_fail_sysfs;
 
@@ -425,8 +556,33 @@ static int cfg80211_init(void)
 
 	ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL);
 
+	err = regulatory_init();
+	if (err)
+		goto out_fail_reg;
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	printk(KERN_INFO "cfg80211: Using old static regulatory domain:\n");
+	print_regdomain_info(cfg80211_regdomain);
+	/* The old code still requests for a new regdomain and if
+	 * you have CRDA you get it updated, otherwise you get
+	 * stuck with the static values. We ignore "EU" code as
+	 * that is not a valid ISO / IEC 3166 alpha2 */
+	if (ieee80211_regdom[0] != 'E' &&
+			ieee80211_regdom[1] != 'U')
+		err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
+			ieee80211_regdom, NULL);
+#else
+	err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", NULL);
+	if (err)
+		printk(KERN_ERR "cfg80211: calling CRDA failed - "
+			"unable to update world regulatory domain, "
+			"using static definition\n");
+#endif
+
 	return 0;
 
+out_fail_reg:
+	debugfs_remove(ieee80211_debugfs_dir);
 out_fail_nl80211:
 	unregister_netdevice_notifier(&cfg80211_netdev_notifier);
 out_fail_notifier:
@@ -434,6 +590,7 @@ out_fail_notifier:
 out_fail_sysfs:
 	return err;
 }
+
 subsys_initcall(cfg80211_init);
 
 static void cfg80211_exit(void)
@@ -442,5 +599,6 @@ static void cfg80211_exit(void)
 	nl80211_exit();
 	unregister_netdevice_notifier(&cfg80211_netdev_notifier);
 	wiphy_sysfs_exit();
+	regulatory_exit();
 }
 module_exit(cfg80211_exit);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 7a02c356d63..771cc5cc765 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -79,6 +79,6 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv,
 			       char *newname);
 
 void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
-void wiphy_update_regulatory(struct wiphy *wiphy);
+void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby);
 
 #endif /* __NET_WIRELESS_CORE_H */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 77880ba8b61..1221d726ed5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18,6 +18,7 @@
 #include <net/cfg80211.h>
 #include "core.h"
 #include "nl80211.h"
+#include "reg.h"
 
 /* the netlink family */
 static struct genl_family nl80211_fam = {
@@ -88,6 +89,9 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 				.len = IEEE80211_MAX_MESH_ID_LEN },
 	[NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 },
 
+	[NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 },
+	[NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED },
+
 	[NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
@@ -1599,6 +1603,141 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+static const struct nla_policy
+	reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
+	[NL80211_ATTR_REG_RULE_FLAGS]		= { .type = NLA_U32 },
+	[NL80211_ATTR_FREQ_RANGE_START]		= { .type = NLA_U32 },
+	[NL80211_ATTR_FREQ_RANGE_END]		= { .type = NLA_U32 },
+	[NL80211_ATTR_FREQ_RANGE_MAX_BW]	= { .type = NLA_U32 },
+	[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]	= { .type = NLA_U32 },
+	[NL80211_ATTR_POWER_RULE_MAX_EIRP]	= { .type = NLA_U32 },
+};
+
+static int parse_reg_rule(struct nlattr *tb[],
+	struct ieee80211_reg_rule *reg_rule)
+{
+	struct ieee80211_freq_range *freq_range = &reg_rule->freq_range;
+	struct ieee80211_power_rule *power_rule = &reg_rule->power_rule;
+
+	if (!tb[NL80211_ATTR_REG_RULE_FLAGS])
+		return -EINVAL;
+	if (!tb[NL80211_ATTR_FREQ_RANGE_START])
+		return -EINVAL;
+	if (!tb[NL80211_ATTR_FREQ_RANGE_END])
+		return -EINVAL;
+	if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW])
+		return -EINVAL;
+	if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP])
+		return -EINVAL;
+
+	reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]);
+
+	freq_range->start_freq_khz =
+		nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]);
+	freq_range->end_freq_khz =
+		nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]);
+	freq_range->max_bandwidth_khz =
+		nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]);
+
+	power_rule->max_eirp =
+		nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]);
+
+	if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN])
+		power_rule->max_antenna_gain =
+			nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]);
+
+	return 0;
+}
+
+static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
+{
+	int r;
+	char *data = NULL;
+
+	if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
+		return -EINVAL;
+
+	data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	/* We ignore world regdom requests with the old regdom setup */
+	if (is_world_regdom(data))
+		return -EINVAL;
+#endif
+	mutex_lock(&cfg80211_drv_mutex);
+	r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, NULL);
+	mutex_unlock(&cfg80211_drv_mutex);
+	return r;
+}
+
+static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1];
+	struct nlattr *nl_reg_rule;
+	char *alpha2 = NULL;
+	int rem_reg_rules = 0, r = 0;
+	u32 num_rules = 0, rule_idx = 0, size_of_regd;
+	struct ieee80211_regdomain *rd = NULL;
+
+	if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_REG_RULES])
+		return -EINVAL;
+
+	alpha2 = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);
+
+	nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
+			rem_reg_rules) {
+		num_rules++;
+		if (num_rules > NL80211_MAX_SUPP_REG_RULES)
+			goto bad_reg;
+	}
+
+	if (!reg_is_valid_request(alpha2))
+		return -EINVAL;
+
+	size_of_regd = sizeof(struct ieee80211_regdomain) +
+		(num_rules * sizeof(struct ieee80211_reg_rule));
+
+	rd = kzalloc(size_of_regd, GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	rd->n_reg_rules = num_rules;
+	rd->alpha2[0] = alpha2[0];
+	rd->alpha2[1] = alpha2[1];
+
+	nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
+			rem_reg_rules) {
+		nla_parse(tb, NL80211_REG_RULE_ATTR_MAX,
+			nla_data(nl_reg_rule), nla_len(nl_reg_rule),
+			reg_rule_policy);
+		r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
+		if (r)
+			goto bad_reg;
+
+		rule_idx++;
+
+		if (rule_idx > NL80211_MAX_SUPP_REG_RULES)
+			goto bad_reg;
+	}
+
+	BUG_ON(rule_idx != num_rules);
+
+	mutex_lock(&cfg80211_drv_mutex);
+	r = set_regdom(rd);
+	mutex_unlock(&cfg80211_drv_mutex);
+	if (r)
+		goto bad_reg;
+
+	return r;
+
+bad_reg:
+	kfree(rd);
+	return -EINVAL;
+}
+
 static struct genl_ops nl80211_ops[] = {
 	{
 		.cmd = NL80211_CMD_GET_WIPHY,
@@ -1736,6 +1875,18 @@ static struct genl_ops nl80211_ops[] = {
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NL80211_CMD_SET_REG,
+		.doit = nl80211_set_reg,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NL80211_CMD_REQ_SET_REG,
+		.doit = nl80211_req_set_reg,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 /* multicast groups */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 855bff4b325..592b2e391d4 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2,179 +2,758 @@
  * Copyright 2002-2005, Instant802 Networks, Inc.
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2008	Luis R. Rodriguez <lrodriguz@atheros.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
 
-/*
- * This regulatory domain control implementation is highly incomplete, it
- * only exists for the purpose of not regressing mac80211.
- *
- * For now, drivers can restrict the set of allowed channels by either
- * not registering those channels or setting the IEEE80211_CHAN_DISABLED
- * flag; that flag will only be *set* by this code, never *cleared.
+/**
+ * DOC: Wireless regulatory infrastructure
  *
  * The usual implementation is for a driver to read a device EEPROM to
  * determine which regulatory domain it should be operating under, then
  * looking up the allowable channels in a driver-local table and finally
  * registering those channels in the wiphy structure.
  *
- * Alternatively, drivers that trust the regulatory domain control here
- * will register a complete set of capabilities and the control code
- * will restrict the set by setting the IEEE80211_CHAN_* flags.
+ * Another set of compliance enforcement is for drivers to use their
+ * own compliance limits which can be stored on the EEPROM. The host
+ * driver or firmware may ensure these are used.
+ *
+ * In addition to all this we provide an extra layer of regulatory
+ * conformance. For drivers which do not have any regulatory
+ * information CRDA provides the complete regulatory solution.
+ * For others it provides a community effort on further restrictions
+ * to enhance compliance.
+ *
+ * Note: When number of rules --> infinity we will not be able to
+ * index on alpha2 any more, instead we'll probably have to
+ * rely on some SHA1 checksum of the regdomain for example.
+ *
  */
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/nl80211.h>
+#include <linux/platform_device.h>
 #include <net/wireless.h>
+#include <net/cfg80211.h>
 #include "core.h"
+#include "reg.h"
 
-static char *ieee80211_regdom = "US";
-module_param(ieee80211_regdom, charp, 0444);
-MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
+/* To trigger userspace events */
+static struct platform_device *reg_pdev;
 
-struct ieee80211_channel_range {
-	short start_freq;
-	short end_freq;
-	int max_power;
-	int max_antenna_gain;
-	u32 flags;
+/* Keep the ordering from large to small */
+static u32 supported_bandwidths[] = {
+	MHZ_TO_KHZ(40),
+	MHZ_TO_KHZ(20),
 };
 
-struct ieee80211_regdomain {
-	const char *code;
-	const struct ieee80211_channel_range *ranges;
-	int n_ranges;
-};
+bool is_world_regdom(char *alpha2)
+{
+	if (!alpha2)
+		return false;
+	if (alpha2[0] == '0' && alpha2[1] == '0')
+		return true;
+	return false;
+}
 
-#define RANGE_PWR(_start, _end, _pwr, _ag, _flags)	\
-	{ _start, _end, _pwr, _ag, _flags }
+static bool is_alpha2_set(char *alpha2)
+{
+	if (!alpha2)
+		return false;
+	if (alpha2[0] != 0 && alpha2[1] != 0)
+		return true;
+	return false;
+}
 
+static bool is_alpha_upper(char letter)
+{
+	/* ASCII A - Z */
+	if (letter >= 65 && letter <= 90)
+		return true;
+	return false;
+}
 
-/*
- * Ideally, in the future, these definitions will be loaded from a
- * userspace table via some daemon.
- */
-static const struct ieee80211_channel_range ieee80211_US_channels[] = {
-	/* IEEE 802.11b/g, channels 1..11 */
-	RANGE_PWR(2412, 2462, 27, 6, 0),
-	/* IEEE 802.11a, channel 36*/
-	RANGE_PWR(5180, 5180, 23, 6, 0),
-	/* IEEE 802.11a, channel 40*/
-	RANGE_PWR(5200, 5200, 23, 6, 0),
-	/* IEEE 802.11a, channel 44*/
-	RANGE_PWR(5220, 5220, 23, 6, 0),
-	/* IEEE 802.11a, channels 48..64 */
-	RANGE_PWR(5240, 5320, 23, 6, 0),
-	/* IEEE 802.11a, channels 149..165, outdoor */
-	RANGE_PWR(5745, 5825, 30, 6, 0),
-};
+static bool is_unknown_alpha2(char *alpha2)
+{
+	if (!alpha2)
+		return false;
+	/* Special case where regulatory domain was built by driver
+	 * but a specific alpha2 cannot be determined */
+	if (alpha2[0] == '9' && alpha2[1] == '9')
+		return true;
+	return false;
+}
 
-static const struct ieee80211_channel_range ieee80211_JP_channels[] = {
-	/* IEEE 802.11b/g, channels 1..14 */
-	RANGE_PWR(2412, 2484, 20, 6, 0),
-	/* IEEE 802.11a, channels 34..48 */
-	RANGE_PWR(5170, 5240, 20, 6, IEEE80211_CHAN_PASSIVE_SCAN),
-	/* IEEE 802.11a, channels 52..64 */
-	RANGE_PWR(5260, 5320, 20, 6, IEEE80211_CHAN_NO_IBSS |
-				     IEEE80211_CHAN_RADAR),
-};
+static bool is_an_alpha2(char *alpha2)
+{
+	if (!alpha2)
+		return false;
+	if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1]))
+		return true;
+	return false;
+}
 
-static const struct ieee80211_channel_range ieee80211_EU_channels[] = {
-	/* IEEE 802.11b/g, channels 1..13 */
-	RANGE_PWR(2412, 2472, 20, 6, 0),
-	/* IEEE 802.11a, channel 36*/
-	RANGE_PWR(5180, 5180, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN),
-	/* IEEE 802.11a, channel 40*/
-	RANGE_PWR(5200, 5200, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN),
-	/* IEEE 802.11a, channel 44*/
-	RANGE_PWR(5220, 5220, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN),
-	/* IEEE 802.11a, channels 48..64 */
-	RANGE_PWR(5240, 5320, 23, 6, IEEE80211_CHAN_NO_IBSS |
-				     IEEE80211_CHAN_RADAR),
-	/* IEEE 802.11a, channels 100..140 */
-	RANGE_PWR(5500, 5700, 30, 6, IEEE80211_CHAN_NO_IBSS |
-				     IEEE80211_CHAN_RADAR),
-};
+static bool alpha2_equal(char *alpha2_x, char *alpha2_y)
+{
+	if (!alpha2_x || !alpha2_y)
+		return false;
+	if (alpha2_x[0] == alpha2_y[0] &&
+		alpha2_x[1] == alpha2_y[1])
+		return true;
+	return false;
+}
+
+static bool regdom_changed(char *alpha2)
+{
+	if (!cfg80211_regdomain)
+		return true;
+	if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+		return false;
+	return true;
+}
+
+/* This lets us keep regulatory code which is updated on a regulatory
+ * basis in userspace. */
+static int call_crda(const char *alpha2)
+{
+	char country_env[9 + 2] = "COUNTRY=";
+	char *envp[] = {
+		country_env,
+		NULL
+	};
+
+	if (!is_world_regdom((char *) alpha2))
+		printk(KERN_INFO "cfg80211: Calling CRDA for country: %c%c\n",
+			alpha2[0], alpha2[1]);
+	else
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+		return -EINVAL;
+#else
+		printk(KERN_INFO "cfg80211: Calling CRDA to update world "
+			"regulatory domain\n");
+#endif
+
+	country_env[8] = alpha2[0];
+	country_env[9] = alpha2[1];
+
+	return kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, envp);
+}
+
+/* This has the logic which determines when a new request
+ * should be ignored. */
+static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
+	char *alpha2, struct ieee80211_regdomain *rd)
+{
+	struct regulatory_request *last_request = NULL;
 
-#define REGDOM(_code)							\
-	{								\
-		.code = __stringify(_code),				\
-		.ranges = ieee80211_ ##_code## _channels,		\
-		.n_ranges = ARRAY_SIZE(ieee80211_ ##_code## _channels),	\
+	/* All initial requests are respected */
+	if (list_empty(&regulatory_requests))
+		return 0;
+
+	last_request = list_first_entry(&regulatory_requests,
+		struct regulatory_request, list);
+
+	switch (set_by) {
+	case REGDOM_SET_BY_INIT:
+		return -EINVAL;
+	case REGDOM_SET_BY_CORE:
+		/* Always respect new wireless core hints, should only
+		 * come in for updating the world regulatory domain at init
+		 * anyway */
+		return 0;
+	case REGDOM_SET_BY_COUNTRY_IE:
+		if (last_request->initiator == set_by) {
+			if (last_request->wiphy != wiphy) {
+				/* Two cards with two APs claiming different
+				 * different Country IE alpha2s!
+				 * You're special!! */
+				if (!alpha2_equal(last_request->alpha2,
+						cfg80211_regdomain->alpha2)) {
+					/* XXX: Deal with conflict, consider
+					 * building a new one out of the
+					 * intersection */
+					WARN_ON(1);
+					return -EOPNOTSUPP;
+				}
+				return -EALREADY;
+			}
+			/* Two consecutive Country IE hints on the same wiphy */
+			if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+				return 0;
+			return -EALREADY;
+		}
+		if (WARN_ON(!is_alpha2_set(alpha2) || !is_an_alpha2(alpha2)),
+				"Invalid Country IE regulatory hint passed "
+				"to the wireless core\n")
+			return -EINVAL;
+		/* We ignore Country IE hints for now, as we haven't yet
+		 * added the dot11MultiDomainCapabilityEnabled flag
+		 * for wiphys */
+		return 1;
+	case REGDOM_SET_BY_DRIVER:
+		BUG_ON(!wiphy);
+		if (last_request->initiator == set_by) {
+			/* Two separate drivers hinting different things,
+			 * this is possible if you have two devices present
+			 * on a system with different EEPROM regulatory
+			 * readings. XXX: Do intersection, we support only
+			 * the first regulatory hint for now */
+			if (last_request->wiphy != wiphy)
+				return -EALREADY;
+			if (rd)
+				return -EALREADY;
+			/* Driver should not be trying to hint different
+			 * regulatory domains! */
+			BUG_ON(!alpha2_equal(alpha2,
+					cfg80211_regdomain->alpha2));
+			return -EALREADY;
+		}
+		if (last_request->initiator == REGDOM_SET_BY_CORE)
+			return 0;
+		/* XXX: Handle intersection, and add the
+		 * dot11MultiDomainCapabilityEnabled flag to wiphy. For now
+		 * we assume the driver has this set to false, following the
+		 * 802.11d dot11MultiDomainCapabilityEnabled documentation */
+		if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
+			return 0;
+		return 0;
+	case REGDOM_SET_BY_USER:
+		if (last_request->initiator == set_by ||
+				last_request->initiator == REGDOM_SET_BY_CORE)
+			return 0;
+		/* Drivers can use their wiphy's reg_notifier()
+		 * to override any information */
+		if (last_request->initiator == REGDOM_SET_BY_DRIVER)
+			return 0;
+		/* XXX: Handle intersection */
+		if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
+			return -EOPNOTSUPP;
+		return 0;
+	default:
+		return -EINVAL;
 	}
+}
 
-static const struct ieee80211_regdomain ieee80211_regdoms[] = {
-	REGDOM(US),
-	REGDOM(JP),
-	REGDOM(EU),
-};
+static bool __reg_is_valid_request(char *alpha2,
+	struct regulatory_request **request)
+{
+	struct regulatory_request *req;
+	if (list_empty(&regulatory_requests))
+		return false;
+	list_for_each_entry(req, &regulatory_requests, list) {
+		if (alpha2_equal(req->alpha2, alpha2)) {
+			*request = req;
+			return true;
+		}
+	}
+	return false;
+}
 
+/* Used by nl80211 before kmalloc'ing our regulatory domain */
+bool reg_is_valid_request(char *alpha2)
+{
+	struct regulatory_request *request = NULL;
+	return  __reg_is_valid_request(alpha2, &request);
+}
 
-static const struct ieee80211_regdomain *get_regdom(void)
+/* Sanity check on a regulatory rule */
+static bool is_valid_reg_rule(struct ieee80211_reg_rule *rule)
 {
-	static const struct ieee80211_channel_range
-	ieee80211_world_channels[] = {
-		/* IEEE 802.11b/g, channels 1..11 */
-		RANGE_PWR(2412, 2462, 27, 6, 0),
-	};
-	static const struct ieee80211_regdomain regdom_world = REGDOM(world);
-	int i;
+	struct ieee80211_freq_range *freq_range = &rule->freq_range;
+	u32 freq_diff;
+
+	if (freq_range->start_freq_khz == 0 || freq_range->end_freq_khz == 0)
+		return false;
+
+	if (freq_range->start_freq_khz > freq_range->end_freq_khz)
+		return false;
+
+	freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
+
+	if (freq_range->max_bandwidth_khz > freq_diff)
+		return false;
+
+	return true;
+}
+
+static bool is_valid_rd(struct ieee80211_regdomain *rd)
+{
+	struct ieee80211_reg_rule *reg_rule = NULL;
+	unsigned int i;
 
-	for (i = 0; i < ARRAY_SIZE(ieee80211_regdoms); i++)
-		if (strcmp(ieee80211_regdom, ieee80211_regdoms[i].code) == 0)
-			return &ieee80211_regdoms[i];
+	if (!rd->n_reg_rules)
+		return false;
 
-	return &regdom_world;
+	for (i = 0; i < rd->n_reg_rules; i++) {
+		reg_rule = &rd->reg_rules[i];
+		if (!is_valid_reg_rule(reg_rule))
+			return false;
+	}
+
+	return true;
 }
 
+/* Returns value in KHz */
+static u32 freq_max_bandwidth(const struct ieee80211_freq_range *freq_range,
+	u32 freq)
+{
+	unsigned int i;
+	for (i = 0; i < ARRAY_SIZE(supported_bandwidths); i++) {
+		u32 start_freq_khz = freq - supported_bandwidths[i]/2;
+		u32 end_freq_khz = freq + supported_bandwidths[i]/2;
+		if (start_freq_khz >= freq_range->start_freq_khz &&
+			end_freq_khz <= freq_range->end_freq_khz)
+			return supported_bandwidths[i];
+	}
+	return 0;
+}
 
-static void handle_channel(struct ieee80211_channel *chan,
-			   const struct ieee80211_regdomain *rd)
+/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
+ * want to just have the channel structure use these */
+static u32 map_regdom_flags(u32 rd_flags)
+{
+	u32 channel_flags = 0;
+	if (rd_flags & NL80211_RRF_PASSIVE_SCAN)
+		channel_flags |= IEEE80211_CHAN_PASSIVE_SCAN;
+	if (rd_flags & NL80211_RRF_NO_IBSS)
+		channel_flags |= IEEE80211_CHAN_NO_IBSS;
+	if (rd_flags & NL80211_RRF_DFS)
+		channel_flags |= IEEE80211_CHAN_RADAR;
+	return channel_flags;
+}
+
+/**
+ * freq_reg_info - get regulatory information for the given frequency
+ * @center_freq: Frequency in KHz for which we want regulatory information for
+ * @bandwidth: the bandwidth requirement you have in KHz, if you do not have one
+ * 	you can set this to 0. If this frequency is allowed we then set
+ * 	this value to the maximum allowed bandwidth.
+ * @reg_rule: the regulatory rule which we have for this frequency
+ *
+ * Use this function to get the regulatory rule for a specific frequency.
+ */
+static int freq_reg_info(u32 center_freq, u32 *bandwidth,
+			 const struct ieee80211_reg_rule **reg_rule)
 {
 	int i;
-	u32 flags = chan->orig_flags;
-	const struct ieee80211_channel_range *rg = NULL;
+	u32 max_bandwidth = 0;
 
-	for (i = 0; i < rd->n_ranges; i++) {
-		if (rd->ranges[i].start_freq <= chan->center_freq &&
-		    chan->center_freq <= rd->ranges[i].end_freq) {
-			rg = &rd->ranges[i];
+	if (!cfg80211_regdomain)
+		return -EINVAL;
+
+	for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) {
+		const struct ieee80211_reg_rule *rr;
+		const struct ieee80211_freq_range *fr = NULL;
+		const struct ieee80211_power_rule *pr = NULL;
+
+		rr = &cfg80211_regdomain->reg_rules[i];
+		fr = &rr->freq_range;
+		pr = &rr->power_rule;
+		max_bandwidth = freq_max_bandwidth(fr, center_freq);
+		if (max_bandwidth && *bandwidth <= max_bandwidth) {
+			*reg_rule = rr;
+			*bandwidth = max_bandwidth;
 			break;
 		}
 	}
 
-	if (!rg) {
-		/* not found */
+	return !max_bandwidth;
+}
+
+static void handle_channel(struct ieee80211_channel *chan)
+{
+	int r;
+	u32 flags = chan->orig_flags;
+	u32 max_bandwidth = 0;
+	const struct ieee80211_reg_rule *reg_rule = NULL;
+	const struct ieee80211_power_rule *power_rule = NULL;
+
+	r = freq_reg_info(MHZ_TO_KHZ(chan->center_freq),
+		&max_bandwidth, &reg_rule);
+
+	if (r) {
 		flags |= IEEE80211_CHAN_DISABLED;
 		chan->flags = flags;
 		return;
 	}
 
-	chan->flags = flags;
+	power_rule = &reg_rule->power_rule;
+
+	chan->flags = flags | map_regdom_flags(reg_rule->flags);
 	chan->max_antenna_gain = min(chan->orig_mag,
-					 rg->max_antenna_gain);
+		(int) MBI_TO_DBI(power_rule->max_antenna_gain));
+	chan->max_bandwidth = KHZ_TO_MHZ(max_bandwidth);
 	if (chan->orig_mpwr)
-		chan->max_power = min(chan->orig_mpwr, rg->max_power);
+		chan->max_power = min(chan->orig_mpwr,
+			(int) MBM_TO_DBM(power_rule->max_eirp));
 	else
-		chan->max_power = rg->max_power;
+		chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp);
 }
 
-static void handle_band(struct ieee80211_supported_band *sband,
-			const struct ieee80211_regdomain *rd)
+static void handle_band(struct ieee80211_supported_band *sband)
 {
 	int i;
 
 	for (i = 0; i < sband->n_channels; i++)
-		handle_channel(&sband->channels[i], rd);
+		handle_channel(&sband->channels[i]);
 }
 
-void wiphy_update_regulatory(struct wiphy *wiphy)
+static void update_all_wiphy_regulatory(enum reg_set_by setby)
 {
-	enum ieee80211_band band;
-	const struct ieee80211_regdomain *rd = get_regdom();
+	struct cfg80211_registered_device *drv;
 
-	for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+	list_for_each_entry(drv, &cfg80211_drv_list, list)
+		wiphy_update_regulatory(&drv->wiphy, setby);
+}
+
+void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby)
+{
+	enum ieee80211_band band;
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
 		if (wiphy->bands[band])
-			handle_band(wiphy->bands[band], rd);
+			handle_band(wiphy->bands[band]);
+		if (wiphy->reg_notifier)
+			wiphy->reg_notifier(wiphy, setby);
+	}
+}
+
+/* Caller must hold &cfg80211_drv_mutex */
+int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
+		      const char *alpha2, struct ieee80211_regdomain *rd)
+{
+	struct regulatory_request *request;
+	char *rd_alpha2;
+	int r = 0;
+
+	r = ignore_request(wiphy, set_by, (char *) alpha2, rd);
+	if (r)
+		return r;
+
+	if (rd)
+		rd_alpha2 = rd->alpha2;
+	else
+		rd_alpha2 = (char *) alpha2;
+
+	switch (set_by) {
+	case REGDOM_SET_BY_CORE:
+	case REGDOM_SET_BY_COUNTRY_IE:
+	case REGDOM_SET_BY_DRIVER:
+	case REGDOM_SET_BY_USER:
+		request = kzalloc(sizeof(struct regulatory_request),
+			GFP_KERNEL);
+		if (!request)
+			return -ENOMEM;
+
+		request->alpha2[0] = rd_alpha2[0];
+		request->alpha2[1] = rd_alpha2[1];
+		request->initiator = set_by;
+		request->wiphy = wiphy;
+
+		list_add_tail(&request->list, &regulatory_requests);
+		if (rd)
+			break;
+		r = call_crda(alpha2);
+#ifndef CONFIG_WIRELESS_OLD_REGULATORY
+		if (r)
+			printk(KERN_ERR "cfg80211: Failed calling CRDA\n");
+#endif
+		break;
+	default:
+		r = -ENOTSUPP;
+		break;
+	}
+
+	return r;
+}
+
+/* If rd is not NULL and if this call fails the caller must free it */
+int regulatory_hint(struct wiphy *wiphy, const char *alpha2,
+	struct ieee80211_regdomain *rd)
+{
+	int r;
+	BUG_ON(!rd && !alpha2);
+
+	mutex_lock(&cfg80211_drv_mutex);
+
+	r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2, rd);
+	if (r || !rd)
+		goto unlock_and_exit;
+
+	/* If the driver passed a regulatory domain we skipped asking
+	 * userspace for one so we can now go ahead and set it */
+	r = set_regdom(rd);
+
+unlock_and_exit:
+	mutex_unlock(&cfg80211_drv_mutex);
+	return r;
+}
+EXPORT_SYMBOL(regulatory_hint);
+
+
+static void print_rd_rules(struct ieee80211_regdomain *rd)
+{
+	unsigned int i;
+	struct ieee80211_reg_rule *reg_rule = NULL;
+	struct ieee80211_freq_range *freq_range = NULL;
+	struct ieee80211_power_rule *power_rule = NULL;
+
+	printk(KERN_INFO "\t(start_freq - end_freq @ bandwidth), "
+		"(max_antenna_gain, max_eirp)\n");
+
+	for (i = 0; i < rd->n_reg_rules; i++) {
+		reg_rule = &rd->reg_rules[i];
+		freq_range = &reg_rule->freq_range;
+		power_rule = &reg_rule->power_rule;
+
+		/* There may not be documentation for max antenna gain
+		 * in certain regions */
+		if (power_rule->max_antenna_gain)
+			printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), "
+				"(%d mBi, %d mBm)\n",
+				freq_range->start_freq_khz,
+				freq_range->end_freq_khz,
+				freq_range->max_bandwidth_khz,
+				power_rule->max_antenna_gain,
+				power_rule->max_eirp);
+		else
+			printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), "
+				"(N/A, %d mBm)\n",
+				freq_range->start_freq_khz,
+				freq_range->end_freq_khz,
+				freq_range->max_bandwidth_khz,
+				power_rule->max_eirp);
+	}
+}
+
+static void print_regdomain(struct ieee80211_regdomain *rd)
+{
+
+	if (is_world_regdom(rd->alpha2))
+		printk(KERN_INFO "cfg80211: World regulatory "
+			"domain updated:\n");
+	else {
+		if (is_unknown_alpha2(rd->alpha2))
+			printk(KERN_INFO "cfg80211: Regulatory domain "
+				"changed to driver built-in settings "
+				"(unknown country)\n");
+		else
+			printk(KERN_INFO "cfg80211: Regulatory domain "
+				"changed to country: %c%c\n",
+				rd->alpha2[0], rd->alpha2[1]);
+	}
+	print_rd_rules(rd);
+}
+
+void print_regdomain_info(struct ieee80211_regdomain *rd)
+{
+	printk(KERN_INFO "cfg80211: Regulatory domain: %c%c\n",
+		rd->alpha2[0], rd->alpha2[1]);
+	print_rd_rules(rd);
+}
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+
+static bool is_old_static_regdom(struct ieee80211_regdomain *rd)
+{
+	if (rd == &us_regdom || rd == &jp_regdom || rd == &eu_regdom)
+		return true;
+	return false;
+}
+
+/* The old crap never deals with a world regulatory domain, it only
+ * deals with the static regulatory domain passed and if possible
+ * an updated "US" or "JP" regulatory domain. We do however store the
+ * old static regulatory domain in cfg80211_world_regdom for convenience
+ * of use here */
+static void reset_regdomains_static(void)
+{
+	if (!is_old_static_regdom(cfg80211_regdomain))
+		kfree(cfg80211_regdomain);
+	/* This is setting the regdom to the old static regdom */
+	cfg80211_regdomain =
+		(struct ieee80211_regdomain *) cfg80211_world_regdom;
+}
+#else
+static void reset_regdomains(void)
+{
+	if (cfg80211_world_regdom && cfg80211_world_regdom != &world_regdom) {
+		if (cfg80211_world_regdom == cfg80211_regdomain) {
+			kfree(cfg80211_regdomain);
+		} else {
+			kfree(cfg80211_world_regdom);
+			kfree(cfg80211_regdomain);
+		}
+	} else if (cfg80211_regdomain && cfg80211_regdomain != &world_regdom)
+		kfree(cfg80211_regdomain);
+
+	cfg80211_world_regdom = (struct ieee80211_regdomain *) &world_regdom;
+	cfg80211_regdomain = NULL;
+}
+
+/* Dynamic world regulatory domain requested by the wireless
+ * core upon initialization */
+static void update_world_regdomain(struct ieee80211_regdomain *rd)
+{
+	BUG_ON(list_empty(&regulatory_requests));
+
+	reset_regdomains();
+
+	cfg80211_world_regdom = rd;
+	cfg80211_regdomain = rd;
+}
+#endif
+
+static int __set_regdom(struct ieee80211_regdomain *rd)
+{
+	struct regulatory_request *request = NULL;
+
+	/* Some basic sanity checks first */
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	/* We ignore the world regdom with the old static regdomains setup
+	 * as there is no point to it with satic regulatory definitions :(
+	 * Don't worry this shit will be removed soon... */
+	if (is_world_regdom(rd->alpha2))
+		return -EINVAL;
+#else
+	if (is_world_regdom(rd->alpha2)) {
+		if (WARN_ON(!__reg_is_valid_request(rd->alpha2, &request)))
+			return -EINVAL;
+		update_world_regdomain(rd);
+		return 0;
+	}
+#endif
+
+	if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) &&
+			!is_unknown_alpha2(rd->alpha2))
+		return -EINVAL;
+
+	if (list_empty(&regulatory_requests))
+		return -EINVAL;
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	/* Static "US" and "JP" will be overridden, but just once */
+	if (!is_old_static_regdom(cfg80211_regdomain) &&
+			!regdom_changed(rd->alpha2))
+		return -EINVAL;
+#else
+	if (!regdom_changed(rd->alpha2))
+		return -EINVAL;
+#endif
+
+	/* Now lets set the regulatory domain, update all driver channels
+	 * and finally inform them of what we have done, in case they want
+	 * to review or adjust their own settings based on their own
+	 * internal EEPROM data */
+
+	if (WARN_ON(!__reg_is_valid_request(rd->alpha2, &request)))
+		return -EINVAL;
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	reset_regdomains_static();
+#else
+	reset_regdomains();
+#endif
+
+	/* Country IE parsing coming soon */
+	switch (request->initiator) {
+	case REGDOM_SET_BY_CORE:
+	case REGDOM_SET_BY_DRIVER:
+	case REGDOM_SET_BY_USER:
+		if (!is_valid_rd(rd)) {
+			printk(KERN_ERR "cfg80211: Invalid "
+				"regulatory domain detected:\n");
+			print_regdomain_info(rd);
+			return -EINVAL;
+		}
+		break;
+	case REGDOM_SET_BY_COUNTRY_IE: /* Not yet */
+		WARN_ON(1);
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* Tada! */
+	cfg80211_regdomain = rd;
+	request->granted = 1;
+
+	return 0;
+}
+
+
+/* Use this call to set the current regulatory domain. Conflicts with
+ * multiple drivers can be ironed out later. Caller must've already
+ * kmalloc'd the rd structure. If this calls fails you should kfree()
+ * the passed rd. Caller must hold cfg80211_drv_mutex */
+int set_regdom(struct ieee80211_regdomain *rd)
+{
+	struct regulatory_request *this_request = NULL, *prev_request = NULL;
+	int r;
+
+	if (!list_empty(&regulatory_requests))
+		prev_request = list_first_entry(&regulatory_requests,
+			struct regulatory_request, list);
+
+	/* Note that this doesn't update the wiphys, this is done below */
+	r = __set_regdom(rd);
+	if (r)
+		return r;
+
+	BUG_ON((!__reg_is_valid_request(rd->alpha2, &this_request)));
+
+	/* The initial standard core update of the world regulatory domain, no
+	 * need to keep that request info around if it didn't fail. */
+	if (is_world_regdom(rd->alpha2) &&
+			this_request->initiator == REGDOM_SET_BY_CORE &&
+			this_request->granted) {
+		list_del(&this_request->list);
+		kfree(this_request);
+		this_request = NULL;
+	}
+
+	/* Remove old requests, we only leave behind the last one */
+	if (prev_request) {
+		list_del(&prev_request->list);
+		kfree(prev_request);
+		prev_request = NULL;
+	}
+
+	/* This would make this whole thing pointless */
+	BUG_ON(rd != cfg80211_regdomain);
+
+	/* update all wiphys now with the new established regulatory domain */
+	update_all_wiphy_regulatory(this_request->initiator);
+
+	print_regdomain(rd);
+
+	return r;
+}
+
+int regulatory_init(void)
+{
+	reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
+	if (IS_ERR(reg_pdev))
+		return PTR_ERR(reg_pdev);
+	return 0;
+}
+
+void regulatory_exit(void)
+{
+	struct regulatory_request *req, *req_tmp;
+	mutex_lock(&cfg80211_drv_mutex);
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	reset_regdomains_static();
+#else
+	reset_regdomains();
+#endif
+	list_for_each_entry_safe(req, req_tmp, &regulatory_requests, list) {
+		list_del(&req->list);
+		kfree(req);
+	}
+	platform_device_unregister(reg_pdev);
+	mutex_unlock(&cfg80211_drv_mutex);
 }
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
new file mode 100644
index 00000000000..d75fd023297
--- /dev/null
+++ b/net/wireless/reg.h
@@ -0,0 +1,44 @@
+#ifndef __NET_WIRELESS_REG_H
+#define __NET_WIRELESS_REG_H
+
+extern const struct ieee80211_regdomain world_regdom;
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+extern const struct ieee80211_regdomain us_regdom;
+extern const struct ieee80211_regdomain jp_regdom;
+extern const struct ieee80211_regdomain eu_regdom;
+#endif
+
+extern struct ieee80211_regdomain *cfg80211_regdomain;
+extern struct ieee80211_regdomain *cfg80211_world_regdom;
+extern struct list_head regulatory_requests;
+
+struct regdom_last_setby {
+	struct wiphy *wiphy;
+	u8 initiator;
+};
+
+/* wiphy is set if this request's initiator is REGDOM_SET_BY_DRIVER */
+struct regulatory_request {
+	struct list_head list;
+	struct wiphy *wiphy;
+	int granted;
+	enum reg_set_by initiator;
+	char alpha2[2];
+};
+
+bool is_world_regdom(char *alpha2);
+bool reg_is_valid_request(char *alpha2);
+
+int set_regdom(struct ieee80211_regdomain *rd);
+int __regulatory_hint_alpha2(struct wiphy *wiphy, enum reg_set_by set_by,
+		      const char *alpha2);
+
+int regulatory_init(void);
+void regulatory_exit(void);
+
+void print_regdomain_info(struct ieee80211_regdomain *);
+
+/* If a char is A-Z */
+#define IS_ALPHA(letter) (letter >= 65 && letter <= 90)
+
+#endif  /* __NET_WIRELESS_REG_H */
-- 
cgit v1.2.3


From 24723d1bc9da79a53d0495b9cf9ee18747121b03 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:46 +0200
Subject: mac80211: move ieee80211_sta_expire

ieee80211_sta_expire uses the internal __sta_info_unlink
function which can become static if this function is moved
to sta_info.c.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c     | 26 --------------------------
 net/mac80211/sta_info.c | 28 +++++++++++++++++++++++++++-
 net/mac80211/sta_info.h |  3 ++-
 3 files changed, 29 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2c06f6965b7..ffc47c81a16 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1913,32 +1913,6 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 }
 
 
-static void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct sta_info *sta, *tmp;
-	LIST_HEAD(tmp_list);
-	DECLARE_MAC_BUF(mac);
-	unsigned long flags;
-
-	spin_lock_irqsave(&local->sta_lock, flags);
-	list_for_each_entry_safe(sta, tmp, &local->sta_list, list)
-		if (time_after(jiffies, sta->last_rx + exp_time)) {
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
-			printk(KERN_DEBUG "%s: expiring inactive STA %s\n",
-			       sdata->dev->name, print_mac(mac, sta->addr));
-#endif
-			__sta_info_unlink(&sta);
-			if (sta)
-				list_add(&sta->list, &tmp_list);
-		}
-	spin_unlock_irqrestore(&local->sta_lock, flags);
-
-	list_for_each_entry_safe(sta, tmp, &tmp_list, list)
-		sta_info_destroy(sta);
-}
-
-
 static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f2ba653b9d6..3370b262563 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -424,7 +424,7 @@ void sta_info_clear_tim_bit(struct sta_info *sta)
 	spin_unlock_irqrestore(&sta->local->sta_lock, flags);
 }
 
-void __sta_info_unlink(struct sta_info **sta)
+static void __sta_info_unlink(struct sta_info **sta)
 {
 	struct ieee80211_local *local = (*sta)->local;
 	struct ieee80211_sub_if_data *sdata = (*sta)->sdata;
@@ -802,3 +802,29 @@ void sta_info_flush_delayed(struct ieee80211_sub_if_data *sdata)
 		schedule_work(&local->sta_flush_work);
 	spin_unlock_irqrestore(&local->sta_lock, flags);
 }
+
+void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
+			  unsigned long exp_time)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta, *tmp;
+	LIST_HEAD(tmp_list);
+	DECLARE_MAC_BUF(mac);
+	unsigned long flags;
+
+	spin_lock_irqsave(&local->sta_lock, flags);
+	list_for_each_entry_safe(sta, tmp, &local->sta_list, list)
+		if (time_after(jiffies, sta->last_rx + exp_time)) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+			printk(KERN_DEBUG "%s: expiring inactive STA %s\n",
+			       sdata->dev->name, print_mac(mac, sta->addr));
+#endif
+			__sta_info_unlink(&sta);
+			if (sta)
+				list_add(&sta->list, &tmp_list);
+		}
+	spin_unlock_irqrestore(&local->sta_lock, flags);
+
+	list_for_each_entry_safe(sta, tmp, &tmp_list, list)
+		sta_info_destroy(sta);
+}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 4a581a5b576..22007990099 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -452,7 +452,6 @@ int sta_info_insert(struct sta_info *sta);
  * has already unlinked it.
  */
 void sta_info_unlink(struct sta_info **sta);
-void __sta_info_unlink(struct sta_info **sta);
 
 void sta_info_destroy(struct sta_info *sta);
 void sta_info_set_tim_bit(struct sta_info *sta);
@@ -464,5 +463,7 @@ void sta_info_stop(struct ieee80211_local *local);
 int sta_info_flush(struct ieee80211_local *local,
 		    struct ieee80211_sub_if_data *sdata);
 void sta_info_flush_delayed(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
+			  unsigned long exp_time);
 
 #endif /* STA_INFO_H */
-- 
cgit v1.2.3


From a1678f84bff9b20807f7f6a45ebfb56a0c02b353 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:47 +0200
Subject: mac80211: move STA timer restart

This I shouldn't have moved to the scan implementation, move
it back to the MLME where it belongs, to the notification.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 13 +++++++++++++
 net/mac80211/scan.c | 14 --------------
 2 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ffc47c81a16..809fb916089 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2594,6 +2594,13 @@ void ieee80211_sta_work(struct work_struct *work)
 	}
 }
 
+static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
+{
+	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
+	    ieee80211_vif_is_mesh(&sdata->vif))
+		ieee80211_sta_timer((unsigned long)sdata);
+}
+
 void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
 {
 	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
@@ -2606,4 +2613,10 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
 		    !ieee80211_sta_active_ibss(sdata)))
 			ieee80211_sta_find_ibss(sdata, ifsta);
 	}
+
+	/* Restart STA timers */
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list)
+		ieee80211_restart_sta_timer(sdata);
+	rcu_read_unlock();
 }
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 010781b806f..f8b296bed0b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -424,13 +424,6 @@ static void ieee80211_send_nullfunc(struct ieee80211_local *local,
 	ieee80211_tx_skb(sdata, skb, 0);
 }
 
-static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
-{
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    ieee80211_vif_is_mesh(&sdata->vif))
-		ieee80211_sta_timer((unsigned long)sdata);
-}
-
 void ieee80211_scan_completed(struct ieee80211_hw *hw)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
@@ -446,11 +439,6 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 		if (ieee80211_hw_config(local))
 			printk(KERN_DEBUG "%s: failed to restore operational "
 			       "channel after scan\n", wiphy_name(local->hw.wiphy));
-		/* Restart STA timer for HW scan case */
-		rcu_read_lock();
-		list_for_each_entry_rcu(sdata, &local->interfaces, list)
-			ieee80211_restart_sta_timer(sdata);
-		rcu_read_unlock();
 
 		goto done;
 	}
@@ -483,8 +471,6 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 			}
 		} else
 			netif_tx_wake_all_queues(sdata->dev);
-
-		ieee80211_restart_sta_timer(sdata);
 	}
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 7c95069522d02ff144cd421be6618dce619caf7e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:48 +0200
Subject: mac80211: dont set REQ_RUN when scan finishes

The timer restart is done wrongly, we shouldn't set the REQ_RUN
bit when the scan has finished if it hadn't been set before the
scan started. If the timer fires during the scan, it will set
REQ_RUN and then we can run the work for it, if it didn't fire
then we shouldn't run its work either.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 809fb916089..ef73f895372 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2598,7 +2598,8 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
 {
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
 	    ieee80211_vif_is_mesh(&sdata->vif))
-		ieee80211_sta_timer((unsigned long)sdata);
+		queue_work(sdata->local->hw.workqueue,
+			   &sdata->u.sta.work);
 }
 
 void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
-- 
cgit v1.2.3


From 472dbc45dc1966284de72d7de15690c17ed2cf33 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:49 +0200
Subject: mac80211: split off mesh handling entirely

This patch splits off mesh handling from the STA/IBSS.
Unfortunately it increases mesh code size a bit, but I
think it makes things clearer. The patch also reduces
per-interface run-time memory usage.

Also clean up a few places where ifdef is not required.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c              |   6 +-
 net/mac80211/debugfs_netdev.c   |  38 ++---
 net/mac80211/ieee80211_i.h      | 101 +++++++------
 net/mac80211/iface.c            |  13 +-
 net/mac80211/main.c             |  13 +-
 net/mac80211/mesh.c             | 328 +++++++++++++++++++++++++++++++++-------
 net/mac80211/mesh.h             |  15 +-
 net/mac80211/mesh_hwmp.c        |  98 ++++++------
 net/mac80211/mesh_pathtbl.c     |   8 +-
 net/mac80211/mesh_plink.c       |  18 +--
 net/mac80211/mlme.c             |  83 +---------
 net/mac80211/rc80211_pid_algo.c |   7 +-
 net/mac80211/rx.c               |   8 +-
 net/mac80211/scan.c             |   1 +
 net/mac80211/tx.c               |  15 +-
 15 files changed, 457 insertions(+), 295 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5a3bdaad6c1..6ec2127f9a6 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -116,9 +116,9 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 		return ret;
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && params->mesh_id_len)
-		ieee80211_if_sta_set_mesh_id(&sdata->u.sta,
-					     params->mesh_id_len,
-					     params->mesh_id);
+		ieee80211_sdata_set_mesh_id(sdata,
+					    params->mesh_id_len,
+					    params->mesh_id);
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_MNTR || !flags)
 		return 0;
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 8165df578c9..0fa7681a3d2 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -207,37 +207,37 @@ IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC);
 
 #ifdef CONFIG_MAC80211_MESH
 /* Mesh stats attributes */
-IEEE80211_IF_FILE(fwded_frames, u.sta.mshstats.fwded_frames, DEC);
-IEEE80211_IF_FILE(dropped_frames_ttl, u.sta.mshstats.dropped_frames_ttl, DEC);
+IEEE80211_IF_FILE(fwded_frames, u.mesh.mshstats.fwded_frames, DEC);
+IEEE80211_IF_FILE(dropped_frames_ttl, u.mesh.mshstats.dropped_frames_ttl, DEC);
 IEEE80211_IF_FILE(dropped_frames_no_route,
-		u.sta.mshstats.dropped_frames_no_route, DEC);
-IEEE80211_IF_FILE(estab_plinks, u.sta.mshstats.estab_plinks, ATOMIC);
+		u.mesh.mshstats.dropped_frames_no_route, DEC);
+IEEE80211_IF_FILE(estab_plinks, u.mesh.mshstats.estab_plinks, ATOMIC);
 
 /* Mesh parameters */
 IEEE80211_IF_WFILE(dot11MeshMaxRetries,
-		u.sta.mshcfg.dot11MeshMaxRetries, DEC, u8);
+		u.mesh.mshcfg.dot11MeshMaxRetries, DEC, u8);
 IEEE80211_IF_WFILE(dot11MeshRetryTimeout,
-		u.sta.mshcfg.dot11MeshRetryTimeout, DEC, u16);
+		u.mesh.mshcfg.dot11MeshRetryTimeout, DEC, u16);
 IEEE80211_IF_WFILE(dot11MeshConfirmTimeout,
-		u.sta.mshcfg.dot11MeshConfirmTimeout, DEC, u16);
+		u.mesh.mshcfg.dot11MeshConfirmTimeout, DEC, u16);
 IEEE80211_IF_WFILE(dot11MeshHoldingTimeout,
-		u.sta.mshcfg.dot11MeshHoldingTimeout, DEC, u16);
-IEEE80211_IF_WFILE(dot11MeshTTL, u.sta.mshcfg.dot11MeshTTL, DEC, u8);
-IEEE80211_IF_WFILE(auto_open_plinks, u.sta.mshcfg.auto_open_plinks, DEC, u8);
+		u.mesh.mshcfg.dot11MeshHoldingTimeout, DEC, u16);
+IEEE80211_IF_WFILE(dot11MeshTTL, u.mesh.mshcfg.dot11MeshTTL, DEC, u8);
+IEEE80211_IF_WFILE(auto_open_plinks, u.mesh.mshcfg.auto_open_plinks, DEC, u8);
 IEEE80211_IF_WFILE(dot11MeshMaxPeerLinks,
-		u.sta.mshcfg.dot11MeshMaxPeerLinks, DEC, u16);
+		u.mesh.mshcfg.dot11MeshMaxPeerLinks, DEC, u16);
 IEEE80211_IF_WFILE(dot11MeshHWMPactivePathTimeout,
-		u.sta.mshcfg.dot11MeshHWMPactivePathTimeout, DEC, u32);
+		u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout, DEC, u32);
 IEEE80211_IF_WFILE(dot11MeshHWMPpreqMinInterval,
-		u.sta.mshcfg.dot11MeshHWMPpreqMinInterval, DEC, u16);
+		u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval, DEC, u16);
 IEEE80211_IF_WFILE(dot11MeshHWMPnetDiameterTraversalTime,
-		u.sta.mshcfg.dot11MeshHWMPnetDiameterTraversalTime, DEC, u16);
+		u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime, DEC, u16);
 IEEE80211_IF_WFILE(dot11MeshHWMPmaxPREQretries,
-		u.sta.mshcfg.dot11MeshHWMPmaxPREQretries, DEC, u8);
+		u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries, DEC, u8);
 IEEE80211_IF_WFILE(path_refresh_time,
-		u.sta.mshcfg.path_refresh_time, DEC, u32);
+		u.mesh.mshcfg.path_refresh_time, DEC, u32);
 IEEE80211_IF_WFILE(min_discovery_timeout,
-		u.sta.mshcfg.min_discovery_timeout, DEC, u16);
+		u.mesh.mshcfg.min_discovery_timeout, DEC, u16);
 #endif
 
 
@@ -350,7 +350,7 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
 		add_mesh_stats(sdata);
 		add_mesh_config(sdata);
 #endif
-		/* fall through */
+		break;
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
 		add_sta_files(sdata);
@@ -487,7 +487,7 @@ static void del_files(struct ieee80211_sub_if_data *sdata)
 		del_mesh_stats(sdata);
 		del_mesh_config(sdata);
 #endif
-		/* fall through */
+		break;
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
 		del_sta_files(sdata);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6f334e4c3d6..cac0b133454 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -308,7 +308,6 @@ enum ieee80211_sta_mlme_state {
 	IEEE80211_STA_MLME_ASSOCIATED,
 	IEEE80211_STA_MLME_IBSS_SEARCH,
 	IEEE80211_STA_MLME_IBSS_JOINED,
-	IEEE80211_STA_MLME_MESH_UP
 };
 
 /* bitfield of allowed auth algs */
@@ -325,34 +324,6 @@ struct ieee80211_if_sta {
 	size_t ssid_len;
 	u8 scan_ssid[IEEE80211_MAX_SSID_LEN];
 	size_t scan_ssid_len;
-#ifdef CONFIG_MAC80211_MESH
-	struct timer_list mesh_path_timer;
-	u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
-	size_t mesh_id_len;
-	/* Active Path Selection Protocol Identifier */
-	u8 mesh_pp_id[4];
-	/* Active Path Selection Metric Identifier */
-	u8 mesh_pm_id[4];
-	/* Congestion Control Mode Identifier */
-	u8 mesh_cc_id[4];
-	/* Local mesh Destination Sequence Number */
-	u32 dsn;
-	/* Last used PREQ ID */
-	u32 preq_id;
-	atomic_t mpaths;
-	/* Timestamp of last DSN update */
-	unsigned long last_dsn_update;
-	/* Timestamp of last DSN sent */
-	unsigned long last_preq;
-	struct mesh_rmc *rmc;
-	spinlock_t mesh_preq_queue_lock;
-	struct mesh_preq_queue preq_queue;
-	int preq_queue_len;
-	struct mesh_stats mshstats;
-	struct mesh_config mshcfg;
-	u32 mesh_seqnum;
-	bool accepting_plinks;
-#endif
 	u16 aid;
 	u16 ap_capab, capab;
 	u8 *extra_ie; /* to be added to the end of AssocReq */
@@ -387,20 +358,47 @@ struct ieee80211_if_sta {
 	int num_beacons; /* number of TXed beacon frames by this STA */
 };
 
-static inline void ieee80211_if_sta_set_mesh_id(struct ieee80211_if_sta *ifsta,
-						u8 mesh_id_len, u8 *mesh_id)
-{
-#ifdef CONFIG_MAC80211_MESH
-	ifsta->mesh_id_len = mesh_id_len;
-	memcpy(ifsta->mesh_id, mesh_id, mesh_id_len);
-#endif
-}
+struct ieee80211_if_mesh {
+	struct work_struct work;
+	struct timer_list housekeeping_timer;
+	struct timer_list mesh_path_timer;
+	struct sk_buff_head skb_queue;
+
+	bool housekeeping;
+
+	u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
+	size_t mesh_id_len;
+	/* Active Path Selection Protocol Identifier */
+	u8 mesh_pp_id[4];
+	/* Active Path Selection Metric Identifier */
+	u8 mesh_pm_id[4];
+	/* Congestion Control Mode Identifier */
+	u8 mesh_cc_id[4];
+	/* Local mesh Destination Sequence Number */
+	u32 dsn;
+	/* Last used PREQ ID */
+	u32 preq_id;
+	atomic_t mpaths;
+	/* Timestamp of last DSN update */
+	unsigned long last_dsn_update;
+	/* Timestamp of last DSN sent */
+	unsigned long last_preq;
+	struct mesh_rmc *rmc;
+	spinlock_t mesh_preq_queue_lock;
+	struct mesh_preq_queue preq_queue;
+	int preq_queue_len;
+	struct mesh_stats mshstats;
+	struct mesh_config mshcfg;
+	u32 mesh_seqnum;
+	bool accepting_plinks;
+	int num_beacons;
+};
 
 #ifdef CONFIG_MAC80211_MESH
-#define IEEE80211_IFSTA_MESH_CTR_INC(sta, name)	\
-	do { (sta)->mshstats.name++; } while (0)
+#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name)	\
+	do { (msh)->mshstats.name++; } while (0)
 #else
-#define IEEE80211_IFSTA_MESH_CTR_INC(sta, name) \
+#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \
 	do { } while (0)
 #endif
 
@@ -455,6 +453,9 @@ struct ieee80211_sub_if_data {
 		struct ieee80211_if_wds wds;
 		struct ieee80211_if_vlan vlan;
 		struct ieee80211_if_sta sta;
+#ifdef CONFIG_MAC80211_MESH
+		struct ieee80211_if_mesh mesh;
+#endif
 		u32 mntr_flags;
 	} u;
 
@@ -548,6 +549,19 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
 	return container_of(p, struct ieee80211_sub_if_data, vif);
 }
 
+static inline void
+ieee80211_sdata_set_mesh_id(struct ieee80211_sub_if_data *sdata,
+			    u8 mesh_id_len, u8 *mesh_id)
+{
+#ifdef CONFIG_MAC80211_MESH
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	ifmsh->mesh_id_len = mesh_id_len;
+	memcpy(ifmsh->mesh_id, mesh_id, mesh_id_len);
+#else
+	WARN_ON(1);
+#endif
+}
+
 enum {
 	IEEE80211_RX_MSG	= 1,
 	IEEE80211_TX_STATUS_MSG	= 2,
@@ -935,13 +949,6 @@ ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
 void ieee80211_rx_bss_put(struct ieee80211_local *local,
 			  struct ieee80211_sta_bss *bss);
 
-#ifdef CONFIG_MAC80211_MESH
-void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
-#else
-static inline void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
-{}
-#endif
-
 /* interface handling */
 void ieee80211_if_setup(struct net_device *dev);
 int ieee80211_if_add(struct ieee80211_local *local, const char *name,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 672cec60a2f..ddbaa417e2e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -54,10 +54,9 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 
 		break;
 	case IEEE80211_IF_TYPE_MESH_POINT:
-		/* Allow compiler to elide mesh_rmc_free call. */
 		if (ieee80211_vif_is_mesh(&sdata->vif))
 			mesh_rmc_free(sdata);
-		/* fall through */
+		break;
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
 		kfree(sdata->u.sta.extra_ie);
@@ -100,7 +99,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 		skb_queue_head_init(&sdata->u.ap.ps_bc_buf);
 		INIT_LIST_HEAD(&sdata->u.ap.vlans);
 		break;
-	case IEEE80211_IF_TYPE_MESH_POINT:
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
 		ifsta = &sdata->u.sta;
@@ -117,7 +115,8 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 			IEEE80211_STA_AUTO_CHANNEL_SEL;
 		if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4)
 			ifsta->flags |= IEEE80211_STA_WMM_ENABLED;
-
+		break;
+	case IEEE80211_IF_TYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
 			ieee80211_mesh_init_sdata(sdata);
 		break;
@@ -225,9 +224,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) &&
 	    params && params->mesh_id_len)
-		ieee80211_if_sta_set_mesh_id(&sdata->u.sta,
-					     params->mesh_id_len,
-					     params->mesh_id);
+		ieee80211_sdata_set_mesh_id(sdata,
+					    params->mesh_id_len,
+					    params->mesh_id);
 
 	list_add_tail_rcu(&sdata->list, &local->interfaces);
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 6a7f4fae18c..522fe617648 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -252,6 +252,8 @@ static int ieee80211_open(struct net_device *dev)
 		sdata->bss = &sdata->u.ap;
 		break;
 	case IEEE80211_IF_TYPE_MESH_POINT:
+		if (!ieee80211_vif_is_mesh(&sdata->vif))
+			break;
 		/* mesh ifaces must set allmulti to forward mcast traffic */
 		atomic_inc(&local->iff_allmultis);
 		break;
@@ -540,10 +542,6 @@ static int ieee80211_stop(struct net_device *dev)
 		ieee80211_configure_filter(local);
 		netif_addr_unlock_bh(local->mdev);
 		break;
-	case IEEE80211_IF_TYPE_MESH_POINT:
-		/* allmulti is always set on mesh ifaces */
-		atomic_dec(&local->iff_allmultis);
-		/* fall through */
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
 		sdata->u.sta.state = IEEE80211_STA_MLME_DISABLED;
@@ -571,6 +569,13 @@ static int ieee80211_stop(struct net_device *dev)
 		sdata->u.sta.extra_ie = NULL;
 		sdata->u.sta.extra_ie_len = 0;
 		/* fall through */
+	case IEEE80211_IF_TYPE_MESH_POINT:
+		if (ieee80211_vif_is_mesh(&sdata->vif)) {
+			/* allmulti is always set on mesh ifaces */
+			atomic_dec(&local->iff_allmultis);
+			ieee80211_stop_mesh(sdata);
+		}
+		/* fall through */
 	default:
 		conf.vif = &sdata->vif;
 		conf.type = sdata->vif.type;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 3ccb3599c04..9e47725cc59 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -12,6 +12,9 @@
 #include "ieee80211_i.h"
 #include "mesh.h"
 
+#define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ)
+#define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
+
 #define PP_OFFSET 	1		/* Path Selection Protocol */
 #define PM_OFFSET	5		/* Path Selection Metric   */
 #define CC_OFFSET	9		/* Congestion Control Mode */
@@ -35,6 +38,16 @@ void ieee80211s_stop(void)
 	kmem_cache_destroy(rm_cache);
 }
 
+static void ieee80211_mesh_housekeeping_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata = (void *) data;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+	ifmsh->housekeeping = true;
+	queue_work(local->hw.workqueue, &ifmsh->work);
+}
+
 /**
  * mesh_matches_local - check if the config of a mesh point matches ours
  *
@@ -46,7 +59,7 @@ void ieee80211s_stop(void)
  */
 bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_if_sta *sta = &sdata->u.sta;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
 	/*
 	 * As support for each feature is added, check for matching
@@ -58,11 +71,11 @@ bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_dat
 	 *   - MDA enabled
 	 * - Power management control on fc
 	 */
-	if (sta->mesh_id_len == ie->mesh_id_len &&
-		memcmp(sta->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
-		memcmp(sta->mesh_pp_id, ie->mesh_config + PP_OFFSET, 4) == 0 &&
-		memcmp(sta->mesh_pm_id, ie->mesh_config + PM_OFFSET, 4) == 0 &&
-		memcmp(sta->mesh_cc_id, ie->mesh_config + CC_OFFSET, 4) == 0)
+	if (ifmsh->mesh_id_len == ie->mesh_id_len &&
+		memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
+		memcmp(ifmsh->mesh_pp_id, ie->mesh_config + PP_OFFSET, 4) == 0 &&
+		memcmp(ifmsh->mesh_pm_id, ie->mesh_config + PM_OFFSET, 4) == 0 &&
+		memcmp(ifmsh->mesh_cc_id, ie->mesh_config + CC_OFFSET, 4) == 0)
 		return true;
 
 	return false;
@@ -95,11 +108,11 @@ void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
 	 */
 	free_plinks = mesh_plink_availables(sdata);
 
-	if (free_plinks != sdata->u.sta.accepting_plinks)
-		ieee80211_sta_timer((unsigned long) sdata);
+	if (free_plinks != sdata->u.mesh.accepting_plinks)
+		ieee80211_mesh_housekeeping_timer((unsigned long) sdata);
 }
 
-void mesh_ids_set_default(struct ieee80211_if_sta *sta)
+void mesh_ids_set_default(struct ieee80211_if_mesh *sta)
 {
 	u8 def_id[4] = {0x00, 0x0F, 0xAC, 0xff};
 
@@ -112,22 +125,22 @@ int mesh_rmc_init(struct ieee80211_sub_if_data *sdata)
 {
 	int i;
 
-	sdata->u.sta.rmc = kmalloc(sizeof(struct mesh_rmc), GFP_KERNEL);
-	if (!sdata->u.sta.rmc)
+	sdata->u.mesh.rmc = kmalloc(sizeof(struct mesh_rmc), GFP_KERNEL);
+	if (!sdata->u.mesh.rmc)
 		return -ENOMEM;
-	sdata->u.sta.rmc->idx_mask = RMC_BUCKETS - 1;
+	sdata->u.mesh.rmc->idx_mask = RMC_BUCKETS - 1;
 	for (i = 0; i < RMC_BUCKETS; i++)
-		INIT_LIST_HEAD(&sdata->u.sta.rmc->bucket[i].list);
+		INIT_LIST_HEAD(&sdata->u.mesh.rmc->bucket[i].list);
 	return 0;
 }
 
 void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
 {
-	struct mesh_rmc *rmc = sdata->u.sta.rmc;
+	struct mesh_rmc *rmc = sdata->u.mesh.rmc;
 	struct rmc_entry *p, *n;
 	int i;
 
-	if (!sdata->u.sta.rmc)
+	if (!sdata->u.mesh.rmc)
 		return;
 
 	for (i = 0; i < RMC_BUCKETS; i++)
@@ -137,7 +150,7 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
 		}
 
 	kfree(rmc);
-	sdata->u.sta.rmc = NULL;
+	sdata->u.mesh.rmc = NULL;
 }
 
 /**
@@ -155,7 +168,7 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
 int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr,
 		   struct ieee80211_sub_if_data *sdata)
 {
-	struct mesh_rmc *rmc = sdata->u.sta.rmc;
+	struct mesh_rmc *rmc = sdata->u.mesh.rmc;
 	u32 seqnum = 0;
 	int entries = 0;
 	u8 idx;
@@ -217,11 +230,11 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 		}
 	}
 
-	pos = skb_put(skb, 2 + sdata->u.sta.mesh_id_len);
+	pos = skb_put(skb, 2 + sdata->u.mesh.mesh_id_len);
 	*pos++ = WLAN_EID_MESH_ID;
-	*pos++ = sdata->u.sta.mesh_id_len;
-	if (sdata->u.sta.mesh_id_len)
-		memcpy(pos, sdata->u.sta.mesh_id, sdata->u.sta.mesh_id_len);
+	*pos++ = sdata->u.mesh.mesh_id_len;
+	if (sdata->u.mesh.mesh_id_len)
+		memcpy(pos, sdata->u.mesh.mesh_id, sdata->u.mesh.mesh_id_len);
 
 	pos = skb_put(skb, 21);
 	*pos++ = WLAN_EID_MESH_CONFIG;
@@ -230,15 +243,15 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 	*pos++ = 1;
 
 	/* Active path selection protocol ID */
-	memcpy(pos, sdata->u.sta.mesh_pp_id, 4);
+	memcpy(pos, sdata->u.mesh.mesh_pp_id, 4);
 	pos += 4;
 
 	/* Active path selection metric ID   */
-	memcpy(pos, sdata->u.sta.mesh_pm_id, 4);
+	memcpy(pos, sdata->u.mesh.mesh_pm_id, 4);
 	pos += 4;
 
 	/* Congestion control mode identifier */
-	memcpy(pos, sdata->u.sta.mesh_cc_id, 4);
+	memcpy(pos, sdata->u.mesh.mesh_cc_id, 4);
 	pos += 4;
 
 	/* Channel precedence:
@@ -248,8 +261,8 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 	pos += 4;
 
 	/* Mesh capability */
-	sdata->u.sta.accepting_plinks = mesh_plink_availables(sdata);
-	*pos++ = sdata->u.sta.accepting_plinks ? ACCEPT_PLINKS : 0x00;
+	sdata->u.mesh.accepting_plinks = mesh_plink_availables(sdata);
+	*pos++ = sdata->u.mesh.accepting_plinks ? ACCEPT_PLINKS : 0x00;
 	*pos++ = 0x00;
 
 	return;
@@ -337,10 +350,10 @@ static void ieee80211_mesh_path_timer(unsigned long data)
 {
 	struct ieee80211_sub_if_data *sdata =
 		(struct ieee80211_sub_if_data *) data;
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct ieee80211_local *local = wdev_priv(&sdata->wdev);
 
-	queue_work(local->hw.workqueue, &ifsta->work);
+	queue_work(local->hw.workqueue, &ifmsh->work);
 }
 
 struct mesh_table *mesh_table_grow(struct mesh_table *tbl)
@@ -392,50 +405,255 @@ int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr,
 		struct ieee80211_sub_if_data *sdata)
 {
 	meshhdr->flags = 0;
-	meshhdr->ttl = sdata->u.sta.mshcfg.dot11MeshTTL;
-	put_unaligned(cpu_to_le32(sdata->u.sta.mesh_seqnum), &meshhdr->seqnum);
-	sdata->u.sta.mesh_seqnum++;
+	meshhdr->ttl = sdata->u.mesh.mshcfg.dot11MeshTTL;
+	put_unaligned(cpu_to_le32(sdata->u.mesh.mesh_seqnum), &meshhdr->seqnum);
+	sdata->u.mesh.mesh_seqnum++;
 
 	return 6;
 }
 
+static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata,
+			   struct ieee80211_if_mesh *ifmsh)
+{
+	bool free_plinks;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	printk(KERN_DEBUG "%s: running mesh housekeeping\n",
+	       sdata->dev->name);
+#endif
+
+	ieee80211_sta_expire(sdata, IEEE80211_MESH_PEER_INACTIVITY_LIMIT);
+	mesh_path_expire(sdata);
+
+	free_plinks = mesh_plink_availables(sdata);
+	if (free_plinks != sdata->u.mesh.accepting_plinks)
+		ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON);
+
+	ifmsh->housekeeping = false;
+	mod_timer(&ifmsh->housekeeping_timer,
+		  round_jiffies(jiffies + IEEE80211_MESH_HOUSEKEEPING_INTERVAL));
+}
+
+
+void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_local *local = sdata->local;
+
+	ifmsh->housekeeping = true;
+	queue_work(local->hw.workqueue, &ifmsh->work);
+	ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON);
+}
+
+void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
+{
+	del_timer_sync(&sdata->u.mesh.housekeeping_timer);
+	/*
+	 * When we get here, the interface is marked down.
+	 * Call synchronize_rcu() to wait for the RX path
+	 * should it be using the interface and enqueuing
+	 * frames at this very time on another CPU.
+	 */
+	synchronize_rcu();
+	skb_queue_purge(&sdata->u.mesh.skb_queue);
+}
+
+static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
+					u16 stype,
+					struct ieee80211_mgmt *mgmt,
+					size_t len,
+					struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_local *local= sdata->local;
+	struct ieee802_11_elems elems;
+	struct ieee80211_channel *channel;
+	u64 supp_rates = 0;
+	size_t baselen;
+	int freq;
+	enum ieee80211_band band = rx_status->band;
+
+	/* ignore ProbeResp to foreign address */
+	if (stype == IEEE80211_STYPE_PROBE_RESP &&
+	    compare_ether_addr(mgmt->da, sdata->dev->dev_addr))
+		return;
+
+	baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+	if (baselen > len)
+		return;
+
+	ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+			       &elems);
+
+	if (elems.ds_params && elems.ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems.ds_params[0]);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return;
+
+	if (elems.mesh_id && elems.mesh_config &&
+	    mesh_matches_local(&elems, sdata)) {
+		supp_rates = ieee80211_sta_get_rates(local, &elems, band);
+
+		mesh_neighbour_update(mgmt->sa, supp_rates, sdata,
+				      mesh_peer_accepts_plinks(&elems));
+	}
+}
+
+static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
+					  struct ieee80211_mgmt *mgmt,
+					  size_t len,
+					  struct ieee80211_rx_status *rx_status)
+{
+	switch (mgmt->u.action.category) {
+	case PLINK_CATEGORY:
+		mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
+		break;
+	case MESH_PATH_SEL_CATEGORY:
+		mesh_rx_path_sel_frame(sdata, mgmt, len);
+		break;
+	}
+}
+
+static void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+					  struct sk_buff *skb)
+{
+	struct ieee80211_rx_status *rx_status;
+	struct ieee80211_if_mesh *ifmsh;
+	struct ieee80211_mgmt *mgmt;
+	u16 stype;
+
+	ifmsh = &sdata->u.mesh;
+
+	rx_status = (struct ieee80211_rx_status *) skb->cb;
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+	stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+
+	switch (stype) {
+	case IEEE80211_STYPE_PROBE_RESP:
+	case IEEE80211_STYPE_BEACON:
+		ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len,
+					    rx_status);
+		break;
+	case IEEE80211_STYPE_ACTION:
+		ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status);
+		break;
+	}
+
+	kfree_skb(skb);
+}
+
+static void ieee80211_mesh_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data, u.mesh.work);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct sk_buff *skb;
+
+	if (!netif_running(sdata->dev))
+		return;
+
+	if (local->sta_sw_scanning || local->sta_hw_scanning)
+		return;
+
+	while ((skb = skb_dequeue(&ifmsh->skb_queue)))
+		ieee80211_mesh_rx_queued_mgmt(sdata, skb);
+
+	if (ifmsh->preq_queue_len &&
+	    time_after(jiffies,
+		       ifmsh->last_preq + msecs_to_jiffies(ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval)))
+		mesh_path_start_discovery(sdata);
+
+	if (ifmsh->housekeeping)
+		ieee80211_mesh_housekeeping(sdata, ifmsh);
+}
+
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list)
+		if (ieee80211_vif_is_mesh(&sdata->vif))
+			queue_work(local->hw.workqueue, &sdata->u.mesh.work);
+	rcu_read_unlock();
+}
+
 void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-
-	ifsta->mshcfg.dot11MeshRetryTimeout = MESH_RET_T;
-	ifsta->mshcfg.dot11MeshConfirmTimeout = MESH_CONF_T;
-	ifsta->mshcfg.dot11MeshHoldingTimeout = MESH_HOLD_T;
-	ifsta->mshcfg.dot11MeshMaxRetries = MESH_MAX_RETR;
-	ifsta->mshcfg.dot11MeshTTL = MESH_TTL;
-	ifsta->mshcfg.auto_open_plinks = true;
-	ifsta->mshcfg.dot11MeshMaxPeerLinks =
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+	INIT_WORK(&ifmsh->work, ieee80211_mesh_work);
+	setup_timer(&ifmsh->housekeeping_timer,
+		    ieee80211_mesh_housekeeping_timer,
+		    (unsigned long) sdata);
+	skb_queue_head_init(&sdata->u.mesh.skb_queue);
+
+	ifmsh->mshcfg.dot11MeshRetryTimeout = MESH_RET_T;
+	ifmsh->mshcfg.dot11MeshConfirmTimeout = MESH_CONF_T;
+	ifmsh->mshcfg.dot11MeshHoldingTimeout = MESH_HOLD_T;
+	ifmsh->mshcfg.dot11MeshMaxRetries = MESH_MAX_RETR;
+	ifmsh->mshcfg.dot11MeshTTL = MESH_TTL;
+	ifmsh->mshcfg.auto_open_plinks = true;
+	ifmsh->mshcfg.dot11MeshMaxPeerLinks =
 		MESH_MAX_ESTAB_PLINKS;
-	ifsta->mshcfg.dot11MeshHWMPactivePathTimeout =
+	ifmsh->mshcfg.dot11MeshHWMPactivePathTimeout =
 		MESH_PATH_TIMEOUT;
-	ifsta->mshcfg.dot11MeshHWMPpreqMinInterval =
+	ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval =
 		MESH_PREQ_MIN_INT;
-	ifsta->mshcfg.dot11MeshHWMPnetDiameterTraversalTime =
+	ifmsh->mshcfg.dot11MeshHWMPnetDiameterTraversalTime =
 		MESH_DIAM_TRAVERSAL_TIME;
-	ifsta->mshcfg.dot11MeshHWMPmaxPREQretries =
+	ifmsh->mshcfg.dot11MeshHWMPmaxPREQretries =
 		MESH_MAX_PREQ_RETRIES;
-	ifsta->mshcfg.path_refresh_time =
+	ifmsh->mshcfg.path_refresh_time =
 		MESH_PATH_REFRESH_TIME;
-	ifsta->mshcfg.min_discovery_timeout =
+	ifmsh->mshcfg.min_discovery_timeout =
 		MESH_MIN_DISCOVERY_TIMEOUT;
-	ifsta->accepting_plinks = true;
-	ifsta->preq_id = 0;
-	ifsta->dsn = 0;
-	atomic_set(&ifsta->mpaths, 0);
+	ifmsh->accepting_plinks = true;
+	ifmsh->preq_id = 0;
+	ifmsh->dsn = 0;
+	atomic_set(&ifmsh->mpaths, 0);
 	mesh_rmc_init(sdata);
-	ifsta->last_preq = jiffies;
+	ifmsh->last_preq = jiffies;
 	/* Allocate all mesh structures when creating the first mesh interface. */
 	if (!mesh_allocated)
 		ieee80211s_init();
-	mesh_ids_set_default(ifsta);
-	setup_timer(&ifsta->mesh_path_timer,
+	mesh_ids_set_default(ifmsh);
+	setup_timer(&ifmsh->mesh_path_timer,
 		    ieee80211_mesh_path_timer,
 		    (unsigned long) sdata);
-	INIT_LIST_HEAD(&ifsta->preq_queue.list);
-	spin_lock_init(&ifsta->mesh_preq_queue_lock);
+	INIT_LIST_HEAD(&ifmsh->preq_queue.list);
+	spin_lock_init(&ifmsh->mesh_preq_queue_lock);
+}
+
+ieee80211_rx_result
+ieee80211_mesh_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		       struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_mgmt *mgmt;
+	u16 fc;
+
+	if (skb->len < 24)
+		return RX_DROP_MONITOR;
+
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+	fc = le16_to_cpu(mgmt->frame_control);
+
+	switch (fc & IEEE80211_FCTL_STYPE) {
+	case IEEE80211_STYPE_PROBE_RESP:
+	case IEEE80211_STYPE_BEACON:
+	case IEEE80211_STYPE_ACTION:
+		memcpy(skb->cb, rx_status, sizeof(*rx_status));
+		skb_queue_tail(&ifmsh->skb_queue, skb);
+		queue_work(local->hw.workqueue, &ifmsh->work);
+		return RX_QUEUED;
+	}
+
+	return RX_CONTINUE;
 }
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 84ff5d828fd..8ee414a0447 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -206,7 +206,7 @@ int mesh_rmc_check(u8 *addr, struct ieee80211s_hdr *mesh_hdr,
 		struct ieee80211_sub_if_data *sdata);
 bool mesh_matches_local(struct ieee802_11_elems *ie,
 		struct ieee80211_sub_if_data *sdata);
-void mesh_ids_set_default(struct ieee80211_if_sta *sta);
+void mesh_ids_set_default(struct ieee80211_if_mesh *mesh);
 void mesh_mgmt_ies_add(struct sk_buff *skb,
 		struct ieee80211_sub_if_data *sdata);
 void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
@@ -214,6 +214,11 @@ int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
 void ieee80211s_init(void);
 void ieee80211s_stop(void);
 void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
+ieee80211_rx_result
+ieee80211_mesh_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		       struct ieee80211_rx_status *rx_status);
+void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
+void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata);
 
 /* Mesh paths */
 int mesh_nexthop_lookup(struct sk_buff *skb,
@@ -269,8 +274,8 @@ extern int mesh_allocated;
 
 static inline int mesh_plink_free_count(struct ieee80211_sub_if_data *sdata)
 {
-	return sdata->u.sta.mshcfg.dot11MeshMaxPeerLinks -
-	       atomic_read(&sdata->u.sta.mshstats.estab_plinks);
+	return sdata->u.mesh.mshcfg.dot11MeshMaxPeerLinks -
+	       atomic_read(&sdata->u.mesh.mshstats.estab_plinks);
 }
 
 static inline bool mesh_plink_availables(struct ieee80211_sub_if_data *sdata)
@@ -288,8 +293,12 @@ static inline void mesh_path_activate(struct mesh_path *mpath)
 	for (i = 0; i <= x->hash_mask; i++) \
 		hlist_for_each_entry_rcu(node, p, &x->hash_buckets[i], list)
 
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
+
 #else
 #define mesh_allocated	0
+static inline void
+ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
 #endif
 
 #endif /* IEEE80211S_H */
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 210d6b85240..1fad792ad25 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -64,14 +64,14 @@ static inline u32 u32_field_get(u8 *preq_elem, int offset, bool ae)
 #define DSN_LT(x, y) ((long) (x) - (long) (y) < 0)
 
 #define net_traversal_jiffies(s) \
-	msecs_to_jiffies(s->u.sta.mshcfg.dot11MeshHWMPnetDiameterTraversalTime)
+	msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime)
 #define default_lifetime(s) \
-	MSEC_TO_TU(s->u.sta.mshcfg.dot11MeshHWMPactivePathTimeout)
+	MSEC_TO_TU(s->u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout)
 #define min_preq_int_jiff(s) \
-	(msecs_to_jiffies(s->u.sta.mshcfg.dot11MeshHWMPpreqMinInterval))
-#define max_preq_retries(s) (s->u.sta.mshcfg.dot11MeshHWMPmaxPREQretries)
+	(msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval))
+#define max_preq_retries(s) (s->u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries)
 #define disc_timeout_jiff(s) \
-	msecs_to_jiffies(sdata->u.sta.mshcfg.min_discovery_timeout)
+	msecs_to_jiffies(sdata->u.mesh.mshcfg.min_discovery_timeout)
 
 enum mpath_frame_type {
 	MPATH_PREQ = 0,
@@ -395,7 +395,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_mgmt *mgmt,
 				    u8 *preq_elem, u32 metric) {
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct mesh_path *mpath;
 	u8 *dst_addr, *orig_addr;
 	u8 dst_flags, ttl;
@@ -414,11 +414,11 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 		forward = false;
 		reply = true;
 		metric = 0;
-		if (time_after(jiffies, ifsta->last_dsn_update +
+		if (time_after(jiffies, ifmsh->last_dsn_update +
 					net_traversal_jiffies(sdata)) ||
-		    time_before(jiffies, ifsta->last_dsn_update)) {
-			dst_dsn = ++ifsta->dsn;
-			ifsta->last_dsn_update = jiffies;
+		    time_before(jiffies, ifmsh->last_dsn_update)) {
+			dst_dsn = ++ifmsh->dsn;
+			ifmsh->last_dsn_update = jiffies;
 		}
 	} else {
 		rcu_read_lock();
@@ -444,7 +444,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 
 	if (reply) {
 		lifetime = PREQ_IE_LIFETIME(preq_elem);
-		ttl = ifsta->mshcfg.dot11MeshTTL;
+		ttl = ifmsh->mshcfg.dot11MeshTTL;
 		if (ttl != 0)
 			mesh_path_sel_frame_tx(MPATH_PREP, 0, dst_addr,
 				cpu_to_le32(dst_dsn), 0, orig_addr,
@@ -452,7 +452,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 				cpu_to_le32(lifetime), cpu_to_le32(metric),
 				0, sdata);
 		else
-			ifsta->mshstats.dropped_frames_ttl++;
+			ifmsh->mshstats.dropped_frames_ttl++;
 	}
 
 	if (forward) {
@@ -462,7 +462,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 		ttl = PREQ_IE_TTL(preq_elem);
 		lifetime = PREQ_IE_LIFETIME(preq_elem);
 		if (ttl <= 1) {
-			ifsta->mshstats.dropped_frames_ttl++;
+			ifmsh->mshstats.dropped_frames_ttl++;
 			return;
 		}
 		--ttl;
@@ -475,7 +475,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 				hopcount, ttl, cpu_to_le32(lifetime),
 				cpu_to_le32(metric), cpu_to_le32(preq_id),
 				sdata);
-		ifsta->mshstats.fwded_frames++;
+		ifmsh->mshstats.fwded_frames++;
 	}
 }
 
@@ -503,7 +503,7 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 
 	ttl = PREP_IE_TTL(prep_elem);
 	if (ttl <= 1) {
-		sdata->u.sta.mshstats.dropped_frames_ttl++;
+		sdata->u.mesh.mshstats.dropped_frames_ttl++;
 		return;
 	}
 
@@ -533,12 +533,12 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 		cpu_to_le32(lifetime), cpu_to_le32(metric),
 		0, sdata);
 	rcu_read_unlock();
-	sdata->u.sta.mshstats.fwded_frames++;
+	sdata->u.mesh.mshstats.fwded_frames++;
 	return;
 
 fail:
 	rcu_read_unlock();
-	sdata->u.sta.mshstats.dropped_frames_no_route++;
+	sdata->u.mesh.mshstats.dropped_frames_no_route++;
 	return;
 }
 
@@ -631,7 +631,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
 static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
 {
 	struct ieee80211_sub_if_data *sdata = mpath->sdata;
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct mesh_preq_queue *preq_node;
 
 	preq_node = kmalloc(sizeof(struct mesh_preq_queue), GFP_KERNEL);
@@ -640,9 +640,9 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
 		return;
 	}
 
-	spin_lock(&ifsta->mesh_preq_queue_lock);
-	if (ifsta->preq_queue_len == MAX_PREQ_QUEUE_LEN) {
-		spin_unlock(&ifsta->mesh_preq_queue_lock);
+	spin_lock(&ifmsh->mesh_preq_queue_lock);
+	if (ifmsh->preq_queue_len == MAX_PREQ_QUEUE_LEN) {
+		spin_unlock(&ifmsh->mesh_preq_queue_lock);
 		kfree(preq_node);
 		if (printk_ratelimit())
 			printk(KERN_DEBUG "Mesh HWMP: PREQ node queue full\n");
@@ -652,21 +652,21 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
 	memcpy(preq_node->dst, mpath->dst, ETH_ALEN);
 	preq_node->flags = flags;
 
-	list_add_tail(&preq_node->list, &ifsta->preq_queue.list);
-	++ifsta->preq_queue_len;
-	spin_unlock(&ifsta->mesh_preq_queue_lock);
+	list_add_tail(&preq_node->list, &ifmsh->preq_queue.list);
+	++ifmsh->preq_queue_len;
+	spin_unlock(&ifmsh->mesh_preq_queue_lock);
 
-	if (time_after(jiffies, ifsta->last_preq + min_preq_int_jiff(sdata)))
-		queue_work(sdata->local->hw.workqueue, &ifsta->work);
+	if (time_after(jiffies, ifmsh->last_preq + min_preq_int_jiff(sdata)))
+		queue_work(sdata->local->hw.workqueue, &ifmsh->work);
 
-	else if (time_before(jiffies, ifsta->last_preq)) {
+	else if (time_before(jiffies, ifmsh->last_preq)) {
 		/* avoid long wait if did not send preqs for a long time
 		 * and jiffies wrapped around
 		 */
-		ifsta->last_preq = jiffies - min_preq_int_jiff(sdata) - 1;
-		queue_work(sdata->local->hw.workqueue, &ifsta->work);
+		ifmsh->last_preq = jiffies - min_preq_int_jiff(sdata) - 1;
+		queue_work(sdata->local->hw.workqueue, &ifmsh->work);
 	} else
-		mod_timer(&ifsta->mesh_path_timer, ifsta->last_preq +
+		mod_timer(&ifmsh->mesh_path_timer, ifmsh->last_preq +
 						min_preq_int_jiff(sdata));
 }
 
@@ -677,25 +677,25 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
  */
 void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct mesh_preq_queue *preq_node;
 	struct mesh_path *mpath;
 	u8 ttl, dst_flags;
 	u32 lifetime;
 
-	spin_lock(&ifsta->mesh_preq_queue_lock);
-	if (!ifsta->preq_queue_len ||
-		time_before(jiffies, ifsta->last_preq +
+	spin_lock(&ifmsh->mesh_preq_queue_lock);
+	if (!ifmsh->preq_queue_len ||
+		time_before(jiffies, ifmsh->last_preq +
 				min_preq_int_jiff(sdata))) {
-		spin_unlock(&ifsta->mesh_preq_queue_lock);
+		spin_unlock(&ifmsh->mesh_preq_queue_lock);
 		return;
 	}
 
-	preq_node = list_first_entry(&ifsta->preq_queue.list,
+	preq_node = list_first_entry(&ifmsh->preq_queue.list,
 			struct mesh_preq_queue, list);
 	list_del(&preq_node->list);
-	--ifsta->preq_queue_len;
-	spin_unlock(&ifsta->mesh_preq_queue_lock);
+	--ifmsh->preq_queue_len;
+	spin_unlock(&ifmsh->mesh_preq_queue_lock);
 
 	rcu_read_lock();
 	mpath = mesh_path_lookup(preq_node->dst, sdata);
@@ -720,18 +720,18 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 		goto enddiscovery;
 	}
 
-	ifsta->last_preq = jiffies;
+	ifmsh->last_preq = jiffies;
 
-	if (time_after(jiffies, ifsta->last_dsn_update +
+	if (time_after(jiffies, ifmsh->last_dsn_update +
 				net_traversal_jiffies(sdata)) ||
-	    time_before(jiffies, ifsta->last_dsn_update)) {
-		++ifsta->dsn;
-		sdata->u.sta.last_dsn_update = jiffies;
+	    time_before(jiffies, ifmsh->last_dsn_update)) {
+		++ifmsh->dsn;
+		sdata->u.mesh.last_dsn_update = jiffies;
 	}
 	lifetime = default_lifetime(sdata);
-	ttl = sdata->u.sta.mshcfg.dot11MeshTTL;
+	ttl = sdata->u.mesh.mshcfg.dot11MeshTTL;
 	if (ttl == 0) {
-		sdata->u.sta.mshstats.dropped_frames_ttl++;
+		sdata->u.mesh.mshstats.dropped_frames_ttl++;
 		spin_unlock_bh(&mpath->state_lock);
 		goto enddiscovery;
 	}
@@ -743,10 +743,10 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 
 	spin_unlock_bh(&mpath->state_lock);
 	mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->dev->dev_addr,
-			cpu_to_le32(ifsta->dsn), dst_flags, mpath->dst,
+			cpu_to_le32(ifmsh->dsn), dst_flags, mpath->dst,
 			cpu_to_le32(mpath->dsn), sdata->dev->broadcast, 0,
 			ttl, cpu_to_le32(lifetime), 0,
-			cpu_to_le32(ifsta->preq_id++), sdata);
+			cpu_to_le32(ifmsh->preq_id++), sdata);
 	mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout);
 
 enddiscovery:
@@ -783,7 +783,7 @@ int mesh_nexthop_lookup(struct sk_buff *skb,
 		mpath = mesh_path_lookup(dst_addr, sdata);
 		if (!mpath) {
 			dev_kfree_skb(skb);
-			sdata->u.sta.mshstats.dropped_frames_no_route++;
+			sdata->u.mesh.mshstats.dropped_frames_no_route++;
 			err = -ENOSPC;
 			goto endlookup;
 		}
@@ -791,7 +791,7 @@ int mesh_nexthop_lookup(struct sk_buff *skb,
 
 	if (mpath->flags & MESH_PATH_ACTIVE) {
 		if (time_after(jiffies, mpath->exp_time -
-			msecs_to_jiffies(sdata->u.sta.mshcfg.path_refresh_time))
+			msecs_to_jiffies(sdata->u.mesh.mshcfg.path_refresh_time))
 				&& !memcmp(sdata->dev->dev_addr, hdr->addr4,
 					   ETH_ALEN)
 				&& !(mpath->flags & MESH_PATH_RESOLVING)
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 0a60f55f32a..e4fa2905fad 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -153,7 +153,7 @@ int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata)
 	if (is_multicast_ether_addr(dst))
 		return -ENOTSUPP;
 
-	if (atomic_add_unless(&sdata->u.sta.mpaths, 1, MESH_MAX_MPATHS) == 0)
+	if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0)
 		return -ENOSPC;
 
 	err = -ENOMEM;
@@ -221,7 +221,7 @@ err_exists:
 err_node_alloc:
 	kfree(new_mpath);
 err_path_alloc:
-	atomic_dec(&sdata->u.sta.mpaths);
+	atomic_dec(&sdata->u.mesh.mpaths);
 	return err;
 }
 
@@ -306,7 +306,7 @@ static void mesh_path_node_reclaim(struct rcu_head *rp)
 	struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
 
 	del_timer_sync(&node->mpath->timer);
-	atomic_dec(&sdata->u.sta.mpaths);
+	atomic_dec(&sdata->u.mesh.mpaths);
 	kfree(node->mpath);
 	kfree(node);
 }
@@ -401,7 +401,7 @@ void mesh_path_discard_frame(struct sk_buff *skb,
 	}
 
 	kfree_skb(skb);
-	sdata->u.sta.mshstats.dropped_frames_no_route++;
+	sdata->u.mesh.mshstats.dropped_frames_no_route++;
 }
 
 /**
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7356462dee9..990a4b7f6bc 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -36,11 +36,11 @@
 #define MESH_SECURITY_AUTHENTICATION_IMPOSSIBLE	9
 #define MESH_SECURITY_FAILED_VERIFICATION	10
 
-#define dot11MeshMaxRetries(s) (s->u.sta.mshcfg.dot11MeshMaxRetries)
-#define dot11MeshRetryTimeout(s) (s->u.sta.mshcfg.dot11MeshRetryTimeout)
-#define dot11MeshConfirmTimeout(s) (s->u.sta.mshcfg.dot11MeshConfirmTimeout)
-#define dot11MeshHoldingTimeout(s) (s->u.sta.mshcfg.dot11MeshHoldingTimeout)
-#define dot11MeshMaxPeerLinks(s) (s->u.sta.mshcfg.dot11MeshMaxPeerLinks)
+#define dot11MeshMaxRetries(s) (s->u.mesh.mshcfg.dot11MeshMaxRetries)
+#define dot11MeshRetryTimeout(s) (s->u.mesh.mshcfg.dot11MeshRetryTimeout)
+#define dot11MeshConfirmTimeout(s) (s->u.mesh.mshcfg.dot11MeshConfirmTimeout)
+#define dot11MeshHoldingTimeout(s) (s->u.mesh.mshcfg.dot11MeshHoldingTimeout)
+#define dot11MeshMaxPeerLinks(s) (s->u.mesh.mshcfg.dot11MeshMaxPeerLinks)
 
 enum plink_frame_type {
 	PLINK_OPEN = 0,
@@ -63,14 +63,14 @@ enum plink_event {
 static inline
 void mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata)
 {
-	atomic_inc(&sdata->u.sta.mshstats.estab_plinks);
+	atomic_inc(&sdata->u.mesh.mshstats.estab_plinks);
 	mesh_accept_plinks_update(sdata);
 }
 
 static inline
 void mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata)
 {
-	atomic_dec(&sdata->u.sta.mshstats.estab_plinks);
+	atomic_dec(&sdata->u.mesh.mshstats.estab_plinks);
 	mesh_accept_plinks_update(sdata);
 }
 
@@ -245,8 +245,8 @@ void mesh_neighbour_update(u8 *hw_addr, u64 rates, struct ieee80211_sub_if_data
 	sta->last_rx = jiffies;
 	sta->supp_rates[local->hw.conf.channel->band] = rates;
 	if (peer_accepting_plinks && sta->plink_state == PLINK_LISTEN &&
-			sdata->u.sta.accepting_plinks &&
-			sdata->u.sta.mshcfg.auto_open_plinks)
+			sdata->u.mesh.accepting_plinks &&
+			sdata->u.mesh.mshcfg.auto_open_plinks)
 		mesh_plink_open(sta);
 
 	rcu_read_unlock();
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ef73f895372..9e20a0c20a4 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -22,11 +22,11 @@
 #include <linux/rtnetlink.h>
 #include <net/iw_handler.h>
 #include <net/mac80211.h>
+#include <asm/unaligned.h>
 
 #include "ieee80211_i.h"
 #include "rate.h"
 #include "led.h"
-#include "mesh.h"
 
 #define IEEE80211_ASSOC_SCANS_MAX_TRIES 2
 #define IEEE80211_AUTH_TIMEOUT (HZ / 5)
@@ -34,7 +34,6 @@
 #define IEEE80211_ASSOC_TIMEOUT (HZ / 5)
 #define IEEE80211_ASSOC_MAX_TRIES 3
 #define IEEE80211_MONITORING_INTERVAL (2 * HZ)
-#define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
 #define IEEE80211_PROBE_INTERVAL (60 * HZ)
 #define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ)
 #define IEEE80211_SCAN_INTERVAL (2 * HZ)
@@ -43,7 +42,6 @@
 
 #define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
 #define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
-#define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ)
 
 #define IEEE80211_IBSS_MAX_STA_ENTRIES 128
 
@@ -1508,14 +1506,6 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
 		return;
 
-	if (ieee80211_vif_is_mesh(&sdata->vif) && elems->mesh_id &&
-	    elems->mesh_config && mesh_matches_local(elems, sdata)) {
-		supp_rates = ieee80211_sta_get_rates(local, elems, band);
-
-		mesh_neighbour_update(mgmt->sa, supp_rates, sdata,
-				      mesh_peer_accepts_plinks(elems));
-	}
-
 	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems->supp_rates &&
 	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
 		supp_rates = ieee80211_sta_get_rates(local, elems, band);
@@ -1785,26 +1775,6 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 	ieee80211_tx_skb(sdata, skb, 0);
 }
 
-static void ieee80211_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
-				     struct ieee80211_if_sta *ifsta,
-				     struct ieee80211_mgmt *mgmt,
-				     size_t len,
-				     struct ieee80211_rx_status *rx_status)
-{
-	/* currently we only handle mesh interface action frames here */
-	if (!ieee80211_vif_is_mesh(&sdata->vif))
-		return;
-
-	switch (mgmt->u.action.category) {
-	case PLINK_CATEGORY:
-		mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
-		break;
-	case MESH_PATH_SEL_CATEGORY:
-		mesh_rx_path_sel_frame(sdata, mgmt, len);
-		break;
-	}
-}
-
 void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 			   struct ieee80211_rx_status *rx_status)
 {
@@ -1825,7 +1795,6 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *
 	case IEEE80211_STYPE_PROBE_REQ:
 	case IEEE80211_STYPE_PROBE_RESP:
 	case IEEE80211_STYPE_BEACON:
-	case IEEE80211_STYPE_ACTION:
 		memcpy(skb->cb, rx_status, sizeof(*rx_status));
 	case IEEE80211_STYPE_AUTH:
 	case IEEE80211_STYPE_ASSOC_RESP:
@@ -1881,9 +1850,6 @@ static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 	case IEEE80211_STYPE_DISASSOC:
 		ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt, skb->len);
 		break;
-	case IEEE80211_STYPE_ACTION:
-		ieee80211_rx_mgmt_action(sdata, ifsta, mgmt, skb->len, rx_status);
-		break;
 	}
 
 	kfree_skb(skb);
@@ -1928,35 +1894,6 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
 }
 
 
-#ifdef CONFIG_MAC80211_MESH
-static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata,
-			   struct ieee80211_if_sta *ifsta)
-{
-	bool free_plinks;
-
-	ieee80211_sta_expire(sdata, IEEE80211_MESH_PEER_INACTIVITY_LIMIT);
-	mesh_path_expire(sdata);
-
-	free_plinks = mesh_plink_availables(sdata);
-	if (free_plinks != sdata->u.sta.accepting_plinks)
-		ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON);
-
-	mod_timer(&ifsta->timer, jiffies +
-			IEEE80211_MESH_HOUSEKEEPING_INTERVAL);
-}
-
-
-void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
-{
-	struct ieee80211_if_sta *ifsta;
-	ifsta = &sdata->u.sta;
-	ifsta->state = IEEE80211_STA_MLME_MESH_UP;
-	ieee80211_sta_timer((unsigned long)sdata);
-	ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON);
-}
-#endif
-
-
 void ieee80211_sta_timer(unsigned long data)
 {
 	struct ieee80211_sub_if_data *sdata =
@@ -2524,21 +2461,13 @@ void ieee80211_sta_work(struct work_struct *work)
 		return;
 
 	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT))
+		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS))
 		return;
 	ifsta = &sdata->u.sta;
 
 	while ((skb = skb_dequeue(&ifsta->skb_queue)))
 		ieee80211_sta_rx_queued_mgmt(sdata, skb);
 
-#ifdef CONFIG_MAC80211_MESH
-	if (ifsta->preq_queue_len &&
-	    time_after(jiffies,
-		       ifsta->last_preq + msecs_to_jiffies(ifsta->mshcfg.dot11MeshHWMPpreqMinInterval)))
-		mesh_path_start_discovery(sdata);
-#endif
-
 	if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
 	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
 	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
@@ -2575,11 +2504,6 @@ void ieee80211_sta_work(struct work_struct *work)
 	case IEEE80211_STA_MLME_IBSS_JOINED:
 		ieee80211_sta_merge_ibss(sdata, ifsta);
 		break;
-#ifdef CONFIG_MAC80211_MESH
-	case IEEE80211_STA_MLME_MESH_UP:
-		ieee80211_mesh_housekeeping(sdata, ifsta);
-		break;
-#endif
 	default:
 		WARN_ON(1);
 		break;
@@ -2596,8 +2520,7 @@ void ieee80211_sta_work(struct work_struct *work)
 
 static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
 {
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    ieee80211_vif_is_mesh(&sdata->vif))
+	if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
 		queue_work(sdata->local->hw.workqueue,
 			   &sdata->u.sta.work);
 }
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index a914ba73ccf..21e1942ea97 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -148,9 +148,7 @@ static void rate_control_pid_sample(struct rc_pid_info *pinfo,
 				    struct ieee80211_local *local,
 				    struct sta_info *sta)
 {
-#ifdef CONFIG_MAC80211_MESH
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-#endif
 	struct rc_pid_sta_info *spinfo = sta->rate_ctrl_priv;
 	struct rc_pid_rateinfo *rinfo = pinfo->rinfo;
 	struct ieee80211_supported_band *sband;
@@ -181,11 +179,8 @@ static void rate_control_pid_sample(struct rc_pid_info *pinfo,
 		pf = spinfo->last_pf;
 	else {
 		pf = spinfo->tx_num_failed * 100 / spinfo->tx_num_xmit;
-#ifdef CONFIG_MAC80211_MESH
-		if (pf == 100 &&
-		    sdata->vif.type == IEEE80211_IF_TYPE_MESH_POINT)
+		if (ieee80211_vif_is_mesh(&sdata->vif) && pf == 100)
 			mesh_plink_broken(sta);
-#endif
 		pf <<= RC_PID_ARITH_SHIFT;
 		sta->fail_avg = ((pf + (spinfo->last_pf << 3)) / 9)
 					>> RC_PID_ARITH_SHIFT;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index d0803797902..208563a27bc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1404,7 +1404,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 
 	if (rx->flags & IEEE80211_RX_RA_MATCH) {
 		if (!mesh_hdr->ttl)
-			IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.sta,
+			IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh,
 						     dropped_frames_ttl);
 		else {
 			struct ieee80211_hdr *fwd_hdr;
@@ -1591,9 +1591,11 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
 		return RX_DROP_MONITOR;
 
+	if (ieee80211_vif_is_mesh(&sdata->vif))
+		return ieee80211_mesh_rx_mgmt(sdata, rx->skb, rx->status);
+
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT)
+	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
 		return RX_DROP_MONITOR;
 
 	if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f8b296bed0b..f4399e9ac92 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -476,6 +476,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 
  done:
 	ieee80211_mlme_notify_scan_completed(local);
+	ieee80211_mesh_notify_scan_completed(local);
 }
 EXPORT_SYMBOL(ieee80211_scan_completed);
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index c413d4836af..1059b17c83b 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1330,7 +1330,7 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 				if (mesh_nexthop_lookup(skb, osdata))
 					return  0;
 			if (memcmp(odev->dev_addr, hdr->addr4, ETH_ALEN) != 0)
-				IEEE80211_IFSTA_MESH_CTR_INC(&osdata->u.sta,
+				IEEE80211_IFSTA_MESH_CTR_INC(&osdata->u.mesh,
 							     fwded_frames);
 		}
 	}
@@ -1483,9 +1483,9 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 		memcpy(hdr.addr2, dev->dev_addr, ETH_ALEN);
 		memcpy(hdr.addr3, skb->data, ETH_ALEN);
 		memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
-		if (!sdata->u.sta.mshcfg.dot11MeshTTL) {
+		if (!sdata->u.mesh.mshcfg.dot11MeshTTL) {
 			/* Do not send frames with mesh_ttl == 0 */
-			sdata->u.sta.mshstats.dropped_frames_ttl++;
+			sdata->u.mesh.mshstats.dropped_frames_ttl++;
 			ret = 0;
 			goto fail;
 		}
@@ -1815,10 +1815,8 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	struct rate_selection rsel;
 	struct beacon_data *beacon;
 	struct ieee80211_supported_band *sband;
-	struct ieee80211_mgmt *mgmt;
 	int *num_beacons;
 	enum ieee80211_band band = local->hw.conf.channel->band;
-	u8 *pos;
 
 	sband = local->hw.wiphy->bands[band];
 
@@ -1885,7 +1883,11 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 						 IEEE80211_STYPE_BEACON);
 
 		num_beacons = &ifsta->num_beacons;
+#ifdef CONFIG_MAC80211_MESH
 	} else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+		struct ieee80211_mgmt *mgmt;
+		u8 *pos;
+
 		/* headroom, head length, tail length and maximum TIM length */
 		skb = dev_alloc_skb(local->tx_headroom + 400);
 		if (!skb)
@@ -1910,7 +1912,8 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 
 		mesh_mgmt_ies_add(skb, sdata);
 
-		num_beacons = &sdata->u.sta.num_beacons;
+		num_beacons = &sdata->u.mesh.num_beacons;
+#endif
 	} else {
 		WARN_ON(1);
 		goto out;
-- 
cgit v1.2.3


From b7413430d4d2a6168e68231d9f93763047b6d60c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:50 +0200
Subject: mac80211: fix work race

When we stop an interface, the work on it may still be pending
or running. We do cancel the timer, but we do not currently
protect against the work struct. The race is very unlikely to
hit -- it'll happen only when the driver is using mac80211's
workqueue to run long-running tasks and the sta/mesh works are
delayed for quite a bit.

This patch fixes it by cancelling the work explicitly.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 8 ++++++++
 net/mac80211/mesh.c | 9 +++++++++
 2 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 522fe617648..ebdec7106d6 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -547,6 +547,14 @@ static int ieee80211_stop(struct net_device *dev)
 		sdata->u.sta.state = IEEE80211_STA_MLME_DISABLED;
 		memset(sdata->u.sta.bssid, 0, ETH_ALEN);
 		del_timer_sync(&sdata->u.sta.timer);
+		/*
+		 * If the timer fired while we waited for it, it will have
+		 * requeued the work. Now the work will be running again
+		 * but will not rearm the timer again because it checks
+		 * whether the interface is running, which, at this point,
+		 * it no longer is.
+		 */
+		cancel_work_sync(&sdata->u.sta.work);
 		/*
 		 * When we get here, the interface is marked down.
 		 * Call synchronize_rcu() to wait for the RX path
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 9e47725cc59..a0141f5ff18 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -448,6 +448,15 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
 {
 	del_timer_sync(&sdata->u.mesh.housekeeping_timer);
+	/*
+	 * If the timer fired while we waited for it, it will have
+	 * requeued the work. Now the work will be running again
+	 * but will not rearm the timer again because it checks
+	 * whether the interface is running, which, at this point,
+	 * it no longer is.
+	 */
+	cancel_work_sync(&sdata->u.mesh.work);
+
 	/*
 	 * When we get here, the interface is marked down.
 	 * Call synchronize_rcu() to wait for the RX path
-- 
cgit v1.2.3


From 5bc75728fd43bb15b46f16ef465bcf9d487393cf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:51 +0200
Subject: mac80211: fix scan vs. interface removal race

When we remove an interface, we can currently end up having
a pointer to it left in local->scan_sdata after it has been
set down, and then with a hardware scan the scan completion
can try to access it which is a bug. Alternatively, a scan
that started as a hardware scan may terminate as though it
was a software scan, if the timing is just right.

On SMP systems, software scan also has a similar problem,
just canceling the delayed work and setting a flag isn't
enough since it may be running concurrently; in this case
we would also never restore state of other interfaces.

This patch hopefully fixes the problems by always invoking
ieee80211_scan_completed or requiring it to be invoked by
the driver, I suspect the drivers that have ->hw_scan() are
buggy. The bug will not manifest itself unless you remove
the interface while hw-scanning which will also turn off
the hw, and then add a new interface which will be unusable
until you scan once.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 33 +++++++++++++++++++++++++--------
 net/mac80211/mlme.c |  2 +-
 net/mac80211/scan.c | 38 +++++++++++++++++++++++++++-----------
 3 files changed, 53 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ebdec7106d6..4bfac4b41c5 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -564,14 +564,6 @@ static int ieee80211_stop(struct net_device *dev)
 		synchronize_rcu();
 		skb_queue_purge(&sdata->u.sta.skb_queue);
 
-		if (local->scan_sdata == sdata) {
-			if (!local->ops->hw_scan) {
-				local->sta_sw_scanning = 0;
-				cancel_delayed_work(&local->scan_work);
-			} else
-				local->sta_hw_scanning = 0;
-		}
-
 		sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
 		kfree(sdata->u.sta.extra_ie);
 		sdata->u.sta.extra_ie = NULL;
@@ -585,6 +577,31 @@ static int ieee80211_stop(struct net_device *dev)
 		}
 		/* fall through */
 	default:
+		if (local->scan_sdata == sdata) {
+			if (!local->ops->hw_scan)
+				cancel_delayed_work_sync(&local->scan_work);
+			/*
+			 * The software scan can no longer run now, so we can
+			 * clear out the scan_sdata reference. However, the
+			 * hardware scan may still be running. The complete
+			 * function must be prepared to handle a NULL value.
+			 */
+			local->scan_sdata = NULL;
+			/*
+			 * The memory barrier guarantees that another CPU
+			 * that is hardware-scanning will now see the fact
+			 * that this interface is gone.
+			 */
+			smp_mb();
+			/*
+			 * If software scanning, complete the scan but since
+			 * the scan_sdata is NULL already don't send out a
+			 * scan event to userspace -- the scan is incomplete.
+			 */
+			if (local->sta_sw_scanning)
+				ieee80211_scan_completed(&local->hw);
+		}
+
 		conf.vif = &sdata->vif;
 		conf.type = sdata->vif.type;
 		conf.mac_addr = dev->dev_addr;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 9e20a0c20a4..19c7f21e49d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2530,7 +2530,7 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
 	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
 	struct ieee80211_if_sta *ifsta;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata && sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
 		ifsta = &sdata->u.sta;
 		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
 		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f4399e9ac92..27727027d76 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -430,9 +430,20 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	struct ieee80211_sub_if_data *sdata;
 	union iwreq_data wrqu;
 
+	if (WARN_ON(!local->sta_hw_scanning && !local->sta_sw_scanning))
+		return;
+
 	local->last_scan_completed = jiffies;
 	memset(&wrqu, 0, sizeof(wrqu));
-	wireless_send_event(local->scan_sdata->dev, SIOCGIWSCAN, &wrqu, NULL);
+
+	/*
+	 * local->scan_sdata could have been NULLed by the interface
+	 * down code in case we were scanning on an interface that is
+	 * being taken down.
+	 */
+	sdata = local->scan_sdata;
+	if (sdata)
+		wireless_send_event(sdata->dev, SIOCGIWSCAN, &wrqu, NULL);
 
 	if (local->sta_hw_scanning) {
 		local->sta_hw_scanning = 0;
@@ -491,7 +502,10 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 	int skip;
 	unsigned long next_delay = 0;
 
-	if (!local->sta_sw_scanning)
+	/*
+	 * Avoid re-scheduling when the sdata is going away.
+	 */
+	if (!netif_running(sdata->dev))
 		return;
 
 	switch (local->scan_state) {
@@ -570,9 +584,8 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 		break;
 	}
 
-	if (local->sta_sw_scanning)
-		queue_delayed_work(local->hw.workqueue, &local->scan_work,
-				   next_delay);
+	queue_delayed_work(local->hw.workqueue, &local->scan_work,
+			   next_delay);
 }
 
 
@@ -609,13 +622,16 @@ int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 	}
 
 	if (local->ops->hw_scan) {
-		int rc = local->ops->hw_scan(local_to_hw(local),
-					     ssid, ssid_len);
-		if (!rc) {
-			local->sta_hw_scanning = 1;
-			local->scan_sdata = scan_sdata;
+		int rc;
+
+		local->sta_hw_scanning = 1;
+		rc = local->ops->hw_scan(local_to_hw(local), ssid, ssid_len);
+		if (rc) {
+			local->sta_hw_scanning = 0;
+			return rc;
 		}
-		return rc;
+		local->scan_sdata = scan_sdata;
+		return 0;
 	}
 
 	local->sta_sw_scanning = 1;
-- 
cgit v1.2.3


From 9c6bd79011b14a8bfe58aad0acfb51e4dca05eed Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:52 +0200
Subject: mac80211: reorder MLME code more

This way all the utility functions are at the top, then the
state machine and externally callable functions are moved to
the bottom. Also clean up ieee80211_i.h a bit and add a few
comments about which functions are called from where.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  39 +--
 net/mac80211/iface.c       |  17 +-
 net/mac80211/mlme.c        | 700 +++++++++++++++++++++++----------------------
 3 files changed, 384 insertions(+), 372 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index cac0b133454..442a43a3400 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -882,54 +882,53 @@ static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
 }
 
 
-/* ieee80211.c */
 int ieee80211_hw_config(struct ieee80211_local *local);
 int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed);
 void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
 u32 ieee80211_handle_ht(struct ieee80211_local *local, int enable_ht,
 			struct ieee80211_ht_info *req_ht_cap,
 			struct ieee80211_ht_bss_info *req_bss_cap);
+void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+				      u32 changed);
 
-/* ieee80211_ioctl.c */
+/* wireless extensions */
 extern const struct iw_handler_def ieee80211_iw_handler_def;
 int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freq);
 
-/* ieee80211_sta.c */
-void ieee80211_sta_timer(unsigned long data);
-void ieee80211_sta_work(struct work_struct *work);
+/* STA/IBSS code */
+void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
 void ieee80211_sta_scan_work(struct work_struct *work);
 void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 			   struct ieee80211_rx_status *rx_status);
 int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
 int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
 int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
-int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len);
 void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
 			    struct ieee80211_if_sta *ifsta);
-int ieee80211_sta_scan_results(struct ieee80211_local *local,
-			       struct iw_request_info *info,
-			       char *buf, size_t len);
-ieee80211_rx_result ieee80211_sta_rx_scan(
-	struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-	struct ieee80211_rx_status *rx_status);
-void ieee80211_rx_bss_list_init(struct ieee80211_local *local);
-void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local);
-int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len);
 struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 					struct sk_buff *skb, u8 *bssid,
 					u8 *addr, u64 supp_rates);
 int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason);
 int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason);
-void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
-				      u32 changed);
 u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
 u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band);
 void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 			      u8 *ssid, size_t ssid_len);
-void ieee802_11_parse_elems(u8 *start, size_t len,
-			    struct ieee802_11_elems *elems);
+
+/* scan/BSS handling */
+int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len);
+int ieee80211_sta_scan_results(struct ieee80211_local *local,
+			       struct iw_request_info *info,
+			       char *buf, size_t len);
+ieee80211_rx_result ieee80211_sta_rx_scan(
+	struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+	struct ieee80211_rx_status *rx_status);
+void ieee80211_rx_bss_list_init(struct ieee80211_local *local);
+void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local);
+int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len);
+
 void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
 int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 			     u8 *ssid, size_t ssid_len);
@@ -1007,6 +1006,8 @@ void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int ke
 void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata);
 void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      int encrypt);
+void ieee802_11_parse_elems(u8 *start, size_t len,
+			    struct ieee802_11_elems *elems);
 
 #ifdef CONFIG_MAC80211_NOINLINE
 #define debug_noinline noinline
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index ddbaa417e2e..61b19340488 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -83,8 +83,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 				  enum ieee80211_if_types type)
 {
-	struct ieee80211_if_sta *ifsta;
-
 	/* clear type-dependent union */
 	memset(&sdata->u, 0, sizeof(sdata->u));
 
@@ -101,20 +99,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 		break;
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
-		ifsta = &sdata->u.sta;
-		INIT_WORK(&ifsta->work, ieee80211_sta_work);
-		setup_timer(&ifsta->timer, ieee80211_sta_timer,
-			    (unsigned long) sdata);
-		skb_queue_head_init(&ifsta->skb_queue);
-
-		ifsta->capab = WLAN_CAPABILITY_ESS;
-		ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN |
-			IEEE80211_AUTH_ALG_SHARED_KEY;
-		ifsta->flags |= IEEE80211_STA_CREATE_IBSS |
-			IEEE80211_STA_AUTO_BSSID_SEL |
-			IEEE80211_STA_AUTO_CHANNEL_SEL;
-		if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4)
-			ifsta->flags |= IEEE80211_STA_WMM_ENABLED;
+		ieee80211_sta_setup_sdata(sdata);
 		break;
 	case IEEE80211_IF_TYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 19c7f21e49d..e1483010652 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -93,44 +93,46 @@ static int ieee80211_compatible_rates(struct ieee80211_sta_bss *bss,
 	return count;
 }
 
-/* frame sending functions */
-static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
-				struct ieee80211_if_sta *ifsta,
-				int transaction, u8 *extra, size_t extra_len,
-				int encrypt)
+/* also used by mesh code */
+u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
+			    struct ieee802_11_elems *elems,
+			    enum ieee80211_band band)
 {
-	struct ieee80211_local *local = sdata->local;
-	struct sk_buff *skb;
-	struct ieee80211_mgmt *mgmt;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *bitrates;
+	size_t num_rates;
+	u64 supp_rates;
+	int i, j;
+	sband = local->hw.wiphy->bands[band];
 
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
-			    sizeof(*mgmt) + 6 + extra_len);
-	if (!skb) {
-		printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
-		       "frame\n", sdata->dev->name);
-		return;
+	if (!sband) {
+		WARN_ON(1);
+		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 	}
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
-	memset(mgmt, 0, 24 + 6);
-	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
-					  IEEE80211_STYPE_AUTH);
-	if (encrypt)
-		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
-	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
-	mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
-	mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
-	ifsta->auth_transaction = transaction + 1;
-	mgmt->u.auth.status_code = cpu_to_le16(0);
-	if (extra)
-		memcpy(skb_put(skb, extra_len), extra, extra_len);
 
-	ieee80211_tx_skb(sdata, skb, encrypt);
+	bitrates = sband->bitrates;
+	num_rates = sband->n_bitrates;
+	supp_rates = 0;
+	for (i = 0; i < elems->supp_rates_len +
+		     elems->ext_supp_rates_len; i++) {
+		u8 rate = 0;
+		int own_rate;
+		if (i < elems->supp_rates_len)
+			rate = elems->supp_rates[i];
+		else if (elems->ext_supp_rates)
+			rate = elems->ext_supp_rates
+				[i - elems->supp_rates_len];
+		own_rate = 5 * (rate & 0x7f);
+		for (j = 0; j < num_rates; j++)
+			if (bitrates[j].bitrate == own_rate)
+				supp_rates |= BIT(j);
+	}
+	return supp_rates;
 }
 
+/* frame sending functions */
+
+/* also used by scanning code */
 void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 			      u8 *ssid, size_t ssid_len)
 {
@@ -191,6 +193,43 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 	ieee80211_tx_skb(sdata, skb, 0);
 }
 
+static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_if_sta *ifsta,
+				int transaction, u8 *extra, size_t extra_len,
+				int encrypt)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(*mgmt) + 6 + extra_len);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
+		       "frame\n", sdata->dev->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
+	memset(mgmt, 0, 24 + 6);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_AUTH);
+	if (encrypt)
+		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
+	mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
+	ifsta->auth_transaction = transaction + 1;
+	mgmt->u.auth.status_code = cpu_to_le16(0);
+	if (extra)
+		memcpy(skb_put(skb, extra_len), extra, extra_len);
+
+	ieee80211_tx_skb(sdata, skb, encrypt);
+}
+
 static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 				 struct ieee80211_if_sta *ifsta)
 {
@@ -1414,42 +1453,6 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	return res;
 }
 
-u64 ieee80211_sta_get_rates(struct ieee80211_local *local,
-			    struct ieee802_11_elems *elems,
-			    enum ieee80211_band band)
-{
-	struct ieee80211_supported_band *sband;
-	struct ieee80211_rate *bitrates;
-	size_t num_rates;
-	u64 supp_rates;
-	int i, j;
-	sband = local->hw.wiphy->bands[band];
-
-	if (!sband) {
-		WARN_ON(1);
-		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-	}
-
-	bitrates = sband->bitrates;
-	num_rates = sband->n_bitrates;
-	supp_rates = 0;
-	for (i = 0; i < elems->supp_rates_len +
-		     elems->ext_supp_rates_len; i++) {
-		u8 rate = 0;
-		int own_rate;
-		if (i < elems->supp_rates_len)
-			rate = elems->supp_rates[i];
-		else if (elems->ext_supp_rates)
-			rate = elems->ext_supp_rates
-				[i - elems->supp_rates_len];
-		own_rate = 5 * (rate & 0x7f);
-		for (j = 0; j < num_rates; j++)
-			if (bitrates[j].bitrate == own_rate)
-				supp_rates |= BIT(j);
-	}
-	return supp_rates;
-}
-
 static u64 ieee80211_sta_get_mandatory_rates(struct ieee80211_local *local,
 					enum ieee80211_band band)
 {
@@ -1894,7 +1897,7 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
 }
 
 
-void ieee80211_sta_timer(unsigned long data)
+static void ieee80211_sta_timer(unsigned long data)
 {
 	struct ieee80211_sub_if_data *sdata =
 		(struct ieee80211_sub_if_data *) data;
@@ -1937,28 +1940,6 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
 }
 
 
-void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
-			    struct ieee80211_if_sta *ifsta)
-{
-	struct ieee80211_local *local = sdata->local;
-
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
-		return;
-
-	if ((ifsta->flags & (IEEE80211_STA_BSSID_SET |
-			     IEEE80211_STA_AUTO_BSSID_SEL)) &&
-	    (ifsta->flags & (IEEE80211_STA_SSID_SET |
-			     IEEE80211_STA_AUTO_SSID_SEL))) {
-
-		if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED)
-			ieee80211_set_disassoc(sdata, ifsta, true, true,
-					       WLAN_REASON_DEAUTH_LEAVING);
-
-		set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
-		queue_work(local->hw.workqueue, &ifsta->work);
-	}
-}
-
 static int ieee80211_sta_match_ssid(struct ieee80211_if_sta *ifsta,
 				    const char *ssid, int ssid_len)
 {
@@ -2160,113 +2141,190 @@ dont_join:
 }
 
 
-int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_if_sta *ifsta)
 {
-	struct ieee80211_if_sta *ifsta;
-	int res;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_sta_bss *bss, *selected = NULL;
+	int top_rssi = 0, freq;
 
-	if (len > IEEE80211_MAX_SSID_LEN)
-		return -EINVAL;
+	spin_lock_bh(&local->sta_bss_lock);
+	freq = local->oper_channel->center_freq;
+	list_for_each_entry(bss, &local->sta_bss_list, list) {
+		if (!(bss->capability & WLAN_CAPABILITY_ESS))
+			continue;
 
-	ifsta = &sdata->u.sta;
+		if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
+			IEEE80211_STA_AUTO_BSSID_SEL |
+			IEEE80211_STA_AUTO_CHANNEL_SEL)) &&
+		    (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
+		     !!sdata->default_key))
+			continue;
 
-	if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) {
-		memset(ifsta->ssid, 0, sizeof(ifsta->ssid));
-		memcpy(ifsta->ssid, ssid, len);
-		ifsta->ssid_len = len;
-		ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+		if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) &&
+		    bss->freq != freq)
+			continue;
 
-		res = 0;
-		/*
-		 * Hack! MLME code needs to be cleaned up to have different
-		 * entry points for configuration and internal selection change
-		 */
-		if (netif_running(sdata->dev))
-			res = ieee80211_if_config(sdata, IEEE80211_IFCC_SSID);
-		if (res) {
-			printk(KERN_DEBUG "%s: Failed to config new SSID to "
-			       "the low-level driver\n", sdata->dev->name);
-			return res;
-		}
-	}
+		if (!(ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) &&
+		    memcmp(bss->bssid, ifsta->bssid, ETH_ALEN))
+			continue;
 
-	if (len)
-		ifsta->flags |= IEEE80211_STA_SSID_SET;
-	else
-		ifsta->flags &= ~IEEE80211_STA_SSID_SET;
+		if (!(ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) &&
+		    !ieee80211_sta_match_ssid(ifsta, bss->ssid, bss->ssid_len))
+			continue;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
-	    !(ifsta->flags & IEEE80211_STA_BSSID_SET)) {
-		ifsta->ibss_join_req = jiffies;
-		ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
-		return ieee80211_sta_find_ibss(sdata, ifsta);
+		if (!selected || top_rssi < bss->signal) {
+			selected = bss;
+			top_rssi = bss->signal;
+		}
 	}
+	if (selected)
+		atomic_inc(&selected->users);
+	spin_unlock_bh(&local->sta_bss_lock);
 
-	return 0;
-}
-
-
-int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
-{
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	memcpy(ssid, ifsta->ssid, ifsta->ssid_len);
-	*len = ifsta->ssid_len;
-	return 0;
-}
-
+	if (selected) {
+		ieee80211_set_freq(sdata, selected->freq);
+		if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
+			ieee80211_sta_set_ssid(sdata, selected->ssid,
+					       selected->ssid_len);
+		ieee80211_sta_set_bssid(sdata, selected->bssid);
+		ieee80211_sta_def_wmm_params(sdata, selected);
 
-int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
-{
-	struct ieee80211_if_sta *ifsta;
-	int res;
+		/* Send out direct probe if no probe resp was received or
+		 * the one we have is outdated
+		 */
+		if (!selected->last_probe_resp ||
+		    time_after(jiffies, selected->last_probe_resp
+					+ IEEE80211_SCAN_RESULT_EXPIRE))
+			ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+		else
+			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 
+		ieee80211_rx_bss_put(local, selected);
+		ieee80211_sta_reset_auth(sdata, ifsta);
+		return 0;
+	} else {
+		if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
+			ifsta->assoc_scan_tries++;
+			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
+				ieee80211_sta_start_scan(sdata, NULL, 0);
+			else
+				ieee80211_sta_start_scan(sdata, ifsta->ssid,
+							 ifsta->ssid_len);
+			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
+		} else
+			ifsta->state = IEEE80211_STA_MLME_DISABLED;
+	}
+	return -1;
+}
+
+
+static void ieee80211_sta_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data, u.sta.work);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_sta *ifsta;
+	struct sk_buff *skb;
+
+	if (!netif_running(sdata->dev))
+		return;
+
+	if (local->sta_sw_scanning || local->sta_hw_scanning)
+		return;
+
+	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
+		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS))
+		return;
 	ifsta = &sdata->u.sta;
 
-	if (memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0) {
-		memcpy(ifsta->bssid, bssid, ETH_ALEN);
-		res = 0;
-		/*
-		 * Hack! See also ieee80211_sta_set_ssid.
-		 */
-		if (netif_running(sdata->dev))
-			res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
-		if (res) {
-			printk(KERN_DEBUG "%s: Failed to config new BSSID to "
-			       "the low-level driver\n", sdata->dev->name);
-			return res;
-		}
+	while ((skb = skb_dequeue(&ifsta->skb_queue)))
+		ieee80211_sta_rx_queued_mgmt(sdata, skb);
+
+	if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
+	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
+	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
+	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
+		ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
+		return;
 	}
 
-	if (is_valid_ether_addr(bssid))
-		ifsta->flags |= IEEE80211_STA_BSSID_SET;
-	else
-		ifsta->flags &= ~IEEE80211_STA_BSSID_SET;
+	if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
+		if (ieee80211_sta_config_auth(sdata, ifsta))
+			return;
+		clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
+	} else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
+		return;
 
-	return 0;
+	switch (ifsta->state) {
+	case IEEE80211_STA_MLME_DISABLED:
+		break;
+	case IEEE80211_STA_MLME_DIRECT_PROBE:
+		ieee80211_direct_probe(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_AUTHENTICATE:
+		ieee80211_authenticate(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_ASSOCIATE:
+		ieee80211_associate(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_ASSOCIATED:
+		ieee80211_associated(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_IBSS_SEARCH:
+		ieee80211_sta_find_ibss(sdata, ifsta);
+		break;
+	case IEEE80211_STA_MLME_IBSS_JOINED:
+		ieee80211_sta_merge_ibss(sdata, ifsta);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+
+	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
+		printk(KERN_DEBUG "%s: privacy configuration mismatch and "
+		       "mixed-cell disabled - disassociate\n", sdata->dev->name);
+
+		ieee80211_set_disassoc(sdata, ifsta, false, true,
+					WLAN_REASON_UNSPECIFIED);
+	}
 }
 
+static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
+{
+	if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
+		queue_work(sdata->local->hw.workqueue,
+			   &sdata->u.sta.work);
+}
 
-int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
+/* interface setup */
+void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	struct ieee80211_if_sta *ifsta;
 
-	kfree(ifsta->extra_ie);
-	if (len == 0) {
-		ifsta->extra_ie = NULL;
-		ifsta->extra_ie_len = 0;
-		return 0;
-	}
-	ifsta->extra_ie = kmalloc(len, GFP_KERNEL);
-	if (!ifsta->extra_ie) {
-		ifsta->extra_ie_len = 0;
-		return -ENOMEM;
-	}
-	memcpy(ifsta->extra_ie, ie, len);
-	ifsta->extra_ie_len = len;
-	return 0;
+	ifsta = &sdata->u.sta;
+	INIT_WORK(&ifsta->work, ieee80211_sta_work);
+	setup_timer(&ifsta->timer, ieee80211_sta_timer,
+		    (unsigned long) sdata);
+	skb_queue_head_init(&ifsta->skb_queue);
+
+	ifsta->capab = WLAN_CAPABILITY_ESS;
+	ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN |
+		IEEE80211_AUTH_ALG_SHARED_KEY;
+	ifsta->flags |= IEEE80211_STA_CREATE_IBSS |
+		IEEE80211_STA_AUTO_BSSID_SEL |
+		IEEE80211_STA_AUTO_CHANNEL_SEL;
+	if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4)
+		ifsta->flags |= IEEE80211_STA_WMM_ENABLED;
 }
 
-
+/*
+ * Add a new IBSS station, will also be called by the RX code when,
+ * in IBSS mode, receiving a frame from a yet-unknown station, hence
+ * must be callable in atomic context.
+ */
 struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 					struct sk_buff *skb, u8 *bssid,
 					u8 *addr, u64 supp_rates)
@@ -2312,86 +2370,132 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 	return sta;
 }
 
-
-static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
-				     struct ieee80211_if_sta *ifsta)
+/* configuration hooks */
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
+			    struct ieee80211_if_sta *ifsta)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss, *selected = NULL;
-	int top_rssi = 0, freq;
 
-	spin_lock_bh(&local->sta_bss_lock);
-	freq = local->oper_channel->center_freq;
-	list_for_each_entry(bss, &local->sta_bss_list, list) {
-		if (!(bss->capability & WLAN_CAPABILITY_ESS))
-			continue;
+	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+		return;
 
-		if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
-			IEEE80211_STA_AUTO_BSSID_SEL |
-			IEEE80211_STA_AUTO_CHANNEL_SEL)) &&
-		    (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
-		     !!sdata->default_key))
-			continue;
+	if ((ifsta->flags & (IEEE80211_STA_BSSID_SET |
+			     IEEE80211_STA_AUTO_BSSID_SEL)) &&
+	    (ifsta->flags & (IEEE80211_STA_SSID_SET |
+			     IEEE80211_STA_AUTO_SSID_SEL))) {
 
-		if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) &&
-		    bss->freq != freq)
-			continue;
+		if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED)
+			ieee80211_set_disassoc(sdata, ifsta, true, true,
+					       WLAN_REASON_DEAUTH_LEAVING);
 
-		if (!(ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) &&
-		    memcmp(bss->bssid, ifsta->bssid, ETH_ALEN))
-			continue;
+		set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
+		queue_work(local->hw.workqueue, &ifsta->work);
+	}
+}
 
-		if (!(ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) &&
-		    !ieee80211_sta_match_ssid(ifsta, bss->ssid, bss->ssid_len))
-			continue;
+int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+{
+	struct ieee80211_if_sta *ifsta;
+	int res;
 
-		if (!selected || top_rssi < bss->signal) {
-			selected = bss;
-			top_rssi = bss->signal;
+	if (len > IEEE80211_MAX_SSID_LEN)
+		return -EINVAL;
+
+	ifsta = &sdata->u.sta;
+
+	if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) {
+		memset(ifsta->ssid, 0, sizeof(ifsta->ssid));
+		memcpy(ifsta->ssid, ssid, len);
+		ifsta->ssid_len = len;
+		ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+
+		res = 0;
+		/*
+		 * Hack! MLME code needs to be cleaned up to have different
+		 * entry points for configuration and internal selection change
+		 */
+		if (netif_running(sdata->dev))
+			res = ieee80211_if_config(sdata, IEEE80211_IFCC_SSID);
+		if (res) {
+			printk(KERN_DEBUG "%s: Failed to config new SSID to "
+			       "the low-level driver\n", sdata->dev->name);
+			return res;
 		}
 	}
-	if (selected)
-		atomic_inc(&selected->users);
-	spin_unlock_bh(&local->sta_bss_lock);
 
-	if (selected) {
-		ieee80211_set_freq(sdata, selected->freq);
-		if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
-			ieee80211_sta_set_ssid(sdata, selected->ssid,
-					       selected->ssid_len);
-		ieee80211_sta_set_bssid(sdata, selected->bssid);
-		ieee80211_sta_def_wmm_params(sdata, selected);
+	if (len)
+		ifsta->flags |= IEEE80211_STA_SSID_SET;
+	else
+		ifsta->flags &= ~IEEE80211_STA_SSID_SET;
 
-		/* Send out direct probe if no probe resp was received or
-		 * the one we have is outdated
+	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+	    !(ifsta->flags & IEEE80211_STA_BSSID_SET)) {
+		ifsta->ibss_join_req = jiffies;
+		ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
+		return ieee80211_sta_find_ibss(sdata, ifsta);
+	}
+
+	return 0;
+}
+
+int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
+{
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	memcpy(ssid, ifsta->ssid, ifsta->ssid_len);
+	*len = ifsta->ssid_len;
+	return 0;
+}
+
+int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
+{
+	struct ieee80211_if_sta *ifsta;
+	int res;
+
+	ifsta = &sdata->u.sta;
+
+	if (memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0) {
+		memcpy(ifsta->bssid, bssid, ETH_ALEN);
+		res = 0;
+		/*
+		 * Hack! See also ieee80211_sta_set_ssid.
 		 */
-		if (!selected->last_probe_resp ||
-		    time_after(jiffies, selected->last_probe_resp
-					+ IEEE80211_SCAN_RESULT_EXPIRE))
-			ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
-		else
-			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+		if (netif_running(sdata->dev))
+			res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
+		if (res) {
+			printk(KERN_DEBUG "%s: Failed to config new BSSID to "
+			       "the low-level driver\n", sdata->dev->name);
+			return res;
+		}
+	}
 
-		ieee80211_rx_bss_put(local, selected);
-		ieee80211_sta_reset_auth(sdata, ifsta);
+	if (is_valid_ether_addr(bssid))
+		ifsta->flags |= IEEE80211_STA_BSSID_SET;
+	else
+		ifsta->flags &= ~IEEE80211_STA_BSSID_SET;
+
+	return 0;
+}
+
+int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
+{
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+
+	kfree(ifsta->extra_ie);
+	if (len == 0) {
+		ifsta->extra_ie = NULL;
+		ifsta->extra_ie_len = 0;
 		return 0;
-	} else {
-		if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
-			ifsta->assoc_scan_tries++;
-			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
-				ieee80211_sta_start_scan(sdata, NULL, 0);
-			else
-				ieee80211_sta_start_scan(sdata, ifsta->ssid,
-							 ifsta->ssid_len);
-			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
-			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
-		} else
-			ifsta->state = IEEE80211_STA_MLME_DISABLED;
 	}
-	return -1;
+	ifsta->extra_ie = kmalloc(len, GFP_KERNEL);
+	if (!ifsta->extra_ie) {
+		ifsta->extra_ie_len = 0;
+		return -ENOMEM;
+	}
+	memcpy(ifsta->extra_ie, ie, len);
+	ifsta->extra_ie_len = len;
+	return 0;
 }
 
-
 int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason)
 {
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
@@ -2407,7 +2511,6 @@ int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason
 	return 0;
 }
 
-
 int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
 {
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
@@ -2425,6 +2528,28 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
 	return 0;
 }
 
+/* scan finished notification */
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
+	struct ieee80211_if_sta *ifsta;
+
+	if (sdata && sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+		ifsta = &sdata->u.sta;
+		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
+		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
+		    !ieee80211_sta_active_ibss(sdata)))
+			ieee80211_sta_find_ibss(sdata, ifsta);
+	}
+
+	/* Restart STA timers */
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list)
+		ieee80211_restart_sta_timer(sdata);
+	rcu_read_unlock();
+}
+
+/* driver notification call */
 void ieee80211_notify_mac(struct ieee80211_hw *hw,
 			  enum ieee80211_notification_types  notif_type)
 {
@@ -2445,102 +2570,3 @@ void ieee80211_notify_mac(struct ieee80211_hw *hw,
 	}
 }
 EXPORT_SYMBOL(ieee80211_notify_mac);
-
-void ieee80211_sta_work(struct work_struct *work)
-{
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data, u.sta.work);
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_sta *ifsta;
-	struct sk_buff *skb;
-
-	if (!netif_running(sdata->dev))
-		return;
-
-	if (local->sta_sw_scanning || local->sta_hw_scanning)
-		return;
-
-	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS))
-		return;
-	ifsta = &sdata->u.sta;
-
-	while ((skb = skb_dequeue(&ifsta->skb_queue)))
-		ieee80211_sta_rx_queued_mgmt(sdata, skb);
-
-	if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
-	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
-	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
-	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
-		ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
-		return;
-	}
-
-	if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
-		if (ieee80211_sta_config_auth(sdata, ifsta))
-			return;
-		clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
-	} else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
-		return;
-
-	switch (ifsta->state) {
-	case IEEE80211_STA_MLME_DISABLED:
-		break;
-	case IEEE80211_STA_MLME_DIRECT_PROBE:
-		ieee80211_direct_probe(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_AUTHENTICATE:
-		ieee80211_authenticate(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_ASSOCIATE:
-		ieee80211_associate(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_ASSOCIATED:
-		ieee80211_associated(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_IBSS_SEARCH:
-		ieee80211_sta_find_ibss(sdata, ifsta);
-		break;
-	case IEEE80211_STA_MLME_IBSS_JOINED:
-		ieee80211_sta_merge_ibss(sdata, ifsta);
-		break;
-	default:
-		WARN_ON(1);
-		break;
-	}
-
-	if (ieee80211_privacy_mismatch(sdata, ifsta)) {
-		printk(KERN_DEBUG "%s: privacy configuration mismatch and "
-		       "mixed-cell disabled - disassociate\n", sdata->dev->name);
-
-		ieee80211_set_disassoc(sdata, ifsta, false, true,
-					WLAN_REASON_UNSPECIFIED);
-	}
-}
-
-static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
-{
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
-		queue_work(sdata->local->hw.workqueue,
-			   &sdata->u.sta.work);
-}
-
-void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
-{
-	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
-	struct ieee80211_if_sta *ifsta;
-
-	if (sdata && sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		ifsta = &sdata->u.sta;
-		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
-		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
-		    !ieee80211_sta_active_ibss(sdata)))
-			ieee80211_sta_find_ibss(sdata, ifsta);
-	}
-
-	/* Restart STA timers */
-	rcu_read_lock();
-	list_for_each_entry_rcu(sdata, &local->interfaces, list)
-		ieee80211_restart_sta_timer(sdata);
-	rcu_read_unlock();
-}
-- 
cgit v1.2.3


From e16751c3178add97c4f83dcf92e59b536537b22f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:53 +0200
Subject: mac80211: move ieee80211_set_freq to utils

It really doesn't belong into the wireless extensions code.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  2 +-
 net/mac80211/util.c        | 28 ++++++++++++++++++++++++++++
 net/mac80211/wext.c        | 28 ----------------------------
 3 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 442a43a3400..199d64143b4 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -893,7 +893,6 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 
 /* wireless extensions */
 extern const struct iw_handler_def ieee80211_iw_handler_def;
-int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freq);
 
 /* STA/IBSS code */
 void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
@@ -1008,6 +1007,7 @@ void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 		      int encrypt);
 void ieee802_11_parse_elems(u8 *start, size_t len,
 			    struct ieee802_11_elems *elems);
+int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freq);
 
 #ifdef CONFIG_MAC80211_NOINLINE
 #define debug_noinline noinline
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index c3a22ab2ad2..cf0b820a0ea 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -612,3 +612,31 @@ void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 
 	dev_queue_xmit(skb);
 }
+
+int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
+{
+	int ret = -EINVAL;
+	struct ieee80211_channel *chan;
+	struct ieee80211_local *local = sdata->local;
+
+	chan = ieee80211_get_channel(local->hw.wiphy, freqMHz);
+
+	if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) {
+		if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+		    chan->flags & IEEE80211_CHAN_NO_IBSS) {
+			printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
+				"%d MHz\n", sdata->dev->name, chan->center_freq);
+			return ret;
+		}
+		local->oper_channel = chan;
+
+		if (local->sta_sw_scanning || local->sta_hw_scanning)
+			ret = 0;
+		else
+			ret = ieee80211_hw_config(local);
+
+		rate_control_clear(local);
+	}
+
+	return ret;
+}
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index beae664ab48..97d132811c8 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -330,34 +330,6 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev,
 	return 0;
 }
 
-int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
-{
-	int ret = -EINVAL;
-	struct ieee80211_channel *chan;
-	struct ieee80211_local *local = sdata->local;
-
-	chan = ieee80211_get_channel(local->hw.wiphy, freqMHz);
-
-	if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
-		    chan->flags & IEEE80211_CHAN_NO_IBSS) {
-			printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
-				"%d MHz\n", sdata->dev->name, chan->center_freq);
-			return ret;
-		}
-		local->oper_channel = chan;
-
-		if (local->sta_sw_scanning || local->sta_hw_scanning)
-			ret = 0;
-		else
-			ret = ieee80211_hw_config(local);
-
-		rate_control_clear(local);
-	}
-
-	return ret;
-}
-
 static int ieee80211_ioctl_siwfreq(struct net_device *dev,
 				   struct iw_request_info *info,
 				   struct iw_freq *freq, char *extra)
-- 
cgit v1.2.3


From 213cd118cbb88b76ae48f92cfb7dbef9a83cca62 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:54 +0200
Subject: mac80211: make bridge_packets a virtual interface option

The bridge_packets configuration really should be per virtual
interface (theoretically per AP/VLAN, but this is much easier);
there currently is no way to set it yet though. Also invert
the option to "NO_BRIDGE_PACKETS" so the default is to bridge.

While at it, also document the flags properly.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/debugfs.c     |  4 ----
 net/mac80211/ieee80211_i.h | 29 +++++++++++++++++++----------
 net/mac80211/main.c        |  2 --
 net/mac80211/rx.c          |  5 +++--
 4 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index ee509f1109e..24ce5446331 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -51,8 +51,6 @@ DEBUGFS_READONLY_FILE(antenna_sel_tx, 20, "%d",
 		      local->hw.conf.antenna_sel_tx);
 DEBUGFS_READONLY_FILE(antenna_sel_rx, 20, "%d",
 		      local->hw.conf.antenna_sel_rx);
-DEBUGFS_READONLY_FILE(bridge_packets, 20, "%d",
-		      local->bridge_packets);
 DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
 		      local->rts_threshold);
 DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
@@ -206,7 +204,6 @@ void debugfs_hw_add(struct ieee80211_local *local)
 	DEBUGFS_ADD(frequency);
 	DEBUGFS_ADD(antenna_sel_tx);
 	DEBUGFS_ADD(antenna_sel_rx);
-	DEBUGFS_ADD(bridge_packets);
 	DEBUGFS_ADD(rts_threshold);
 	DEBUGFS_ADD(fragmentation_threshold);
 	DEBUGFS_ADD(short_retry_limit);
@@ -263,7 +260,6 @@ void debugfs_hw_del(struct ieee80211_local *local)
 	DEBUGFS_DEL(frequency);
 	DEBUGFS_DEL(antenna_sel_tx);
 	DEBUGFS_DEL(antenna_sel_rx);
-	DEBUGFS_DEL(bridge_packets);
 	DEBUGFS_DEL(rts_threshold);
 	DEBUGFS_DEL(fragmentation_threshold);
 	DEBUGFS_DEL(short_retry_limit);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 199d64143b4..52f36ab1ee5 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -402,11 +402,25 @@ struct ieee80211_if_mesh {
 	do { } while (0)
 #endif
 
-/* flags used in struct ieee80211_sub_if_data.flags */
-#define IEEE80211_SDATA_ALLMULTI	BIT(0)
-#define IEEE80211_SDATA_PROMISC		BIT(1)
-#define IEEE80211_SDATA_USERSPACE_MLME	BIT(2)
-#define IEEE80211_SDATA_OPERATING_GMODE	BIT(3)
+/**
+ * enum ieee80211_sub_if_data_flags - virtual interface flags
+ *
+ * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets
+ * @IEEE80211_SDATA_PROMISC: interface is promisc
+ * @IEEE80211_SDATA_USERSPACE_MLME: userspace MLME is active
+ * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode
+ * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between
+ *	associated stations and deliver multicast frames both
+ *	back to wireless media and to the local net stack.
+ */
+enum ieee80211_sub_if_data_flags {
+	IEEE80211_SDATA_ALLMULTI		= BIT(0),
+	IEEE80211_SDATA_PROMISC			= BIT(1),
+	IEEE80211_SDATA_USERSPACE_MLME		= BIT(2),
+	IEEE80211_SDATA_OPERATING_GMODE		= BIT(3),
+	IEEE80211_SDATA_DONT_BRIDGE_PACKETS	= BIT(4),
+};
+
 struct ieee80211_sub_if_data {
 	struct list_head list;
 
@@ -635,10 +649,6 @@ struct ieee80211_local {
 	struct crypto_blkcipher *wep_rx_tfm;
 	u32 wep_iv;
 
-	int bridge_packets; /* bridge packets between associated stations and
-			     * deliver multicast frames both back to wireless
-			     * media and to the local net stack */
-
 	struct list_head interfaces;
 
 	/*
@@ -726,7 +736,6 @@ struct ieee80211_local {
 		struct dentry *frequency;
 		struct dentry *antenna_sel_tx;
 		struct dentry *antenna_sel_rx;
-		struct dentry *bridge_packets;
 		struct dentry *rts_threshold;
 		struct dentry *fragmentation_threshold;
 		struct dentry *short_retry_limit;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4bfac4b41c5..72e3f5574e9 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1280,8 +1280,6 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 
 	local->hw.queues = 1; /* default */
 
-	local->bridge_packets = 1;
-
 	local->rts_threshold = IEEE80211_MAX_RTS_THRESHOLD;
 	local->fragmentation_threshold = IEEE80211_MAX_FRAG_THRESHOLD;
 	local->short_retry_limit = 7;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 208563a27bc..93f2cda9926 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1221,8 +1221,9 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 	skb = rx->skb;
 	xmit_skb = NULL;
 
-	if (local->bridge_packets && (sdata->vif.type == IEEE80211_IF_TYPE_AP ||
-				      sdata->vif.type == IEEE80211_IF_TYPE_VLAN) &&
+	if ((sdata->vif.type == IEEE80211_IF_TYPE_AP ||
+	     sdata->vif.type == IEEE80211_IF_TYPE_VLAN) &&
+	    !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
 	    (rx->flags & IEEE80211_RX_RA_MATCH)) {
 		if (is_multicast_ether_addr(ehdr->h_dest)) {
 			/*
-- 
cgit v1.2.3


From c2b13452b283f9c4a5b02a6b53ed6416ebf4c03c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:55 +0200
Subject: mac80211: clean up scan namespace

Most of the scan functions are called ieee80211_sta_scan_*
or similar, make clean it up so they are all just called
ieee80211_scan_*.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  53 ++++++++--------
 net/mac80211/main.c        |   6 +-
 net/mac80211/mesh.c        |   2 +-
 net/mac80211/mlme.c        |  51 +++++++--------
 net/mac80211/rx.c          |  10 +--
 net/mac80211/scan.c        | 153 +++++++++++++++++++++++----------------------
 net/mac80211/tx.c          |   2 +-
 net/mac80211/util.c        |   2 +-
 net/mac80211/wext.c        |   6 +-
 9 files changed, 145 insertions(+), 140 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 52f36ab1ee5..5c38ea08454 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -71,9 +71,9 @@ struct ieee80211_fragment_entry {
 };
 
 
-struct ieee80211_sta_bss {
+struct ieee80211_bss {
 	struct list_head list;
-	struct ieee80211_sta_bss *hnext;
+	struct ieee80211_bss *hnext;
 	size_t ssid_len;
 
 	atomic_t users;
@@ -112,7 +112,7 @@ struct ieee80211_sta_bss {
 	u8 erp_value;
 };
 
-static inline u8 *bss_mesh_cfg(struct ieee80211_sta_bss *bss)
+static inline u8 *bss_mesh_cfg(struct ieee80211_bss *bss)
 {
 #ifdef CONFIG_MAC80211_MESH
 	return bss->mesh_cfg;
@@ -120,7 +120,7 @@ static inline u8 *bss_mesh_cfg(struct ieee80211_sta_bss *bss)
 	return NULL;
 }
 
-static inline u8 *bss_mesh_id(struct ieee80211_sta_bss *bss)
+static inline u8 *bss_mesh_id(struct ieee80211_bss *bss)
 {
 #ifdef CONFIG_MAC80211_MESH
 	return bss->mesh_id;
@@ -128,7 +128,7 @@ static inline u8 *bss_mesh_id(struct ieee80211_sta_bss *bss)
 	return NULL;
 }
 
-static inline u8 bss_mesh_id_len(struct ieee80211_sta_bss *bss)
+static inline u8 bss_mesh_id_len(struct ieee80211_bss *bss)
 {
 #ifdef CONFIG_MAC80211_MESH
 	return bss->mesh_id_len;
@@ -658,8 +658,8 @@ struct ieee80211_local {
 	spinlock_t key_lock;
 
 
-	bool sta_sw_scanning;
-	bool sta_hw_scanning;
+	/* Scanning and BSS list */
+	bool sw_scanning, hw_scanning;
 	int scan_channel_idx;
 	enum ieee80211_band scan_band;
 
@@ -670,9 +670,9 @@ struct ieee80211_local {
 	struct ieee80211_channel *oper_channel, *scan_channel;
 	u8 scan_ssid[IEEE80211_MAX_SSID_LEN];
 	size_t scan_ssid_len;
-	struct list_head sta_bss_list;
-	struct ieee80211_sta_bss *sta_bss_hash[STA_HASH_SIZE];
-	spinlock_t sta_bss_lock;
+	struct list_head bss_list;
+	struct ieee80211_bss *bss_hash[STA_HASH_SIZE];
+	spinlock_t bss_lock;
 
 	/* SNMP counters */
 	/* dot11CountersTable */
@@ -905,7 +905,7 @@ extern const struct iw_handler_def ieee80211_iw_handler_def;
 
 /* STA/IBSS code */
 void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
-void ieee80211_sta_scan_work(struct work_struct *work);
+void ieee80211_scan_work(struct work_struct *work);
 void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 			   struct ieee80211_rx_status *rx_status);
 int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
@@ -926,35 +926,38 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 			      u8 *ssid, size_t ssid_len);
 
 /* scan/BSS handling */
-int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len);
-int ieee80211_sta_scan_results(struct ieee80211_local *local,
-			       struct iw_request_info *info,
-			       char *buf, size_t len);
-ieee80211_rx_result ieee80211_sta_rx_scan(
-	struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-	struct ieee80211_rx_status *rx_status);
+int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
+			   u8 *ssid, size_t ssid_len);
+int ieee80211_scan_results(struct ieee80211_local *local,
+			   struct iw_request_info *info,
+			   char *buf, size_t len);
+ieee80211_rx_result
+ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata,
+		  struct sk_buff *skb,
+		  struct ieee80211_rx_status *rx_status);
 void ieee80211_rx_bss_list_init(struct ieee80211_local *local);
 void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local);
-int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len);
+int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata,
+			       char *ie, size_t len);
 
 void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
-int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
-			     u8 *ssid, size_t ssid_len);
-struct ieee80211_sta_bss *
+int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
+			 u8 *ssid, size_t ssid_len);
+struct ieee80211_bss *
 ieee80211_bss_info_update(struct ieee80211_local *local,
 			  struct ieee80211_rx_status *rx_status,
 			  struct ieee80211_mgmt *mgmt,
 			  size_t len,
 			  struct ieee802_11_elems *elems,
 			  int freq, bool beacon);
-struct ieee80211_sta_bss *
+struct ieee80211_bss *
 ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len);
-struct ieee80211_sta_bss *
+struct ieee80211_bss *
 ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len);
 void ieee80211_rx_bss_put(struct ieee80211_local *local,
-			  struct ieee80211_sta_bss *bss);
+			  struct ieee80211_bss *bss);
 
 /* interface handling */
 void ieee80211_if_setup(struct net_device *dev);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 72e3f5574e9..4c424acc01a 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -598,7 +598,7 @@ static int ieee80211_stop(struct net_device *dev)
 			 * the scan_sdata is NULL already don't send out a
 			 * scan event to userspace -- the scan is incomplete.
 			 */
-			if (local->sta_sw_scanning)
+			if (local->sw_scanning)
 				ieee80211_scan_completed(&local->hw);
 		}
 
@@ -732,7 +732,7 @@ int ieee80211_hw_config(struct ieee80211_local *local)
 	struct ieee80211_channel *chan;
 	int ret = 0;
 
-	if (local->sta_sw_scanning)
+	if (local->sw_scanning)
 		chan = local->scan_channel;
 	else
 		chan = local->oper_channel;
@@ -1290,7 +1290,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 
 	spin_lock_init(&local->key_lock);
 
-	INIT_DELAYED_WORK(&local->scan_work, ieee80211_sta_scan_work);
+	INIT_DELAYED_WORK(&local->scan_work, ieee80211_scan_work);
 
 	sta_info_init(local);
 
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index a0141f5ff18..30cf891fd3a 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -566,7 +566,7 @@ static void ieee80211_mesh_work(struct work_struct *work)
 	if (!netif_running(sdata->dev))
 		return;
 
-	if (local->sta_sw_scanning || local->sta_hw_scanning)
+	if (local->sw_scanning || local->hw_scanning)
 		return;
 
 	while ((skb = skb_dequeue(&ifmsh->skb_queue)))
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e1483010652..5b748447eb7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -52,7 +52,7 @@ static int ecw2cw(int ecw)
 	return (1 << ecw) - 1;
 }
 
-static u8 *ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
+static u8 *ieee80211_bss_get_ie(struct ieee80211_bss *bss, u8 ie)
 {
 	u8 *end, *pos;
 
@@ -72,7 +72,7 @@ static u8 *ieee80211_bss_get_ie(struct ieee80211_sta_bss *bss, u8 ie)
 	return NULL;
 }
 
-static int ieee80211_compatible_rates(struct ieee80211_sta_bss *bss,
+static int ieee80211_compatible_rates(struct ieee80211_bss *bss,
 				      struct ieee80211_supported_band *sband,
 				      u64 *rates)
 {
@@ -239,7 +239,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	u8 *pos, *ies, *ht_add_ie;
 	int i, len, count, rates_len, supp_rates_len;
 	u16 capab;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	int wmm = 0;
 	struct ieee80211_supported_band *sband;
 	u64 rates = 0;
@@ -470,7 +470,7 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
 
 /* MLME */
 static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
-					 struct ieee80211_sta_bss *bss)
+					 struct ieee80211_bss *bss)
 {
 	struct ieee80211_local *local = sdata->local;
 	int i, have_higher_than_11mbit = 0;
@@ -621,7 +621,7 @@ static u32 ieee80211_handle_erp_ie(struct ieee80211_sub_if_data *sdata,
 }
 
 static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
-					   struct ieee80211_sta_bss *bss)
+					   struct ieee80211_bss *bss)
 {
 	u32 changed = 0;
 
@@ -674,7 +674,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_conf *conf = &local_to_hw(local)->conf;
 	u32 changed = BSS_CHANGED_ASSOC;
 
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 
 	ifsta->flags |= IEEE80211_STA_ASSOCIATED;
 
@@ -846,7 +846,7 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_if_sta *ifsta)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	int bss_privacy;
 	int wep_privacy;
 	int privacy_invoked;
@@ -1219,7 +1219,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	/* Add STA entry for the AP */
 	sta = sta_info_get(local, ifsta->bssid);
 	if (!sta) {
-		struct ieee80211_sta_bss *bss;
+		struct ieee80211_bss *bss;
 		int err;
 
 		sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC);
@@ -1339,7 +1339,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 
 static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta,
-				   struct ieee80211_sta_bss *bss)
+				   struct ieee80211_bss *bss)
 {
 	struct ieee80211_local *local = sdata->local;
 	int res, rates, i, j;
@@ -1490,7 +1490,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 	int freq;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	struct sta_info *sta;
 	struct ieee80211_channel *channel;
 	u64 beacon_timestamp, rx_timestamp;
@@ -1893,7 +1893,7 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
 
 	printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
 	       "IBSS networks with same SSID (merge)\n", sdata->dev->name);
-	ieee80211_sta_req_scan(sdata, ifsta->ssid, ifsta->ssid_len);
+	ieee80211_request_scan(sdata, ifsta->ssid, ifsta->ssid_len);
 }
 
 
@@ -1974,7 +1974,7 @@ static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	struct ieee80211_supported_band *sband;
 	u8 bssid[ETH_ALEN], *pos;
 	int i;
@@ -2035,7 +2035,7 @@ static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
 				   struct ieee80211_if_sta *ifsta)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	int found = 0;
 	u8 bssid[ETH_ALEN];
 	int active_ibss;
@@ -2050,8 +2050,8 @@ static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
 	printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
 	       sdata->dev->name, active_ibss);
 #endif /* CONFIG_MAC80211_IBSS_DEBUG */
-	spin_lock_bh(&local->sta_bss_lock);
-	list_for_each_entry(bss, &local->sta_bss_list, list) {
+	spin_lock_bh(&local->bss_lock);
+	list_for_each_entry(bss, &local->bss_list, list) {
 		if (ifsta->ssid_len != bss->ssid_len ||
 		    memcmp(ifsta->ssid, bss->ssid, bss->ssid_len) != 0
 		    || !(bss->capability & WLAN_CAPABILITY_IBSS))
@@ -2065,7 +2065,7 @@ static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
 		if (active_ibss || memcmp(bssid, ifsta->bssid, ETH_ALEN) != 0)
 			break;
 	}
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 	if (found)
@@ -2110,7 +2110,7 @@ dont_join:
 			      IEEE80211_SCAN_INTERVAL)) {
 		printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
 		       "join\n", sdata->dev->name);
-		return ieee80211_sta_req_scan(sdata, ifsta->ssid,
+		return ieee80211_request_scan(sdata, ifsta->ssid,
 					      ifsta->ssid_len);
 	} else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) {
 		int interval = IEEE80211_SCAN_INTERVAL;
@@ -2145,12 +2145,12 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_if_sta *ifsta)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_sta_bss *bss, *selected = NULL;
+	struct ieee80211_bss *bss, *selected = NULL;
 	int top_rssi = 0, freq;
 
-	spin_lock_bh(&local->sta_bss_lock);
+	spin_lock_bh(&local->bss_lock);
 	freq = local->oper_channel->center_freq;
-	list_for_each_entry(bss, &local->sta_bss_list, list) {
+	list_for_each_entry(bss, &local->bss_list, list) {
 		if (!(bss->capability & WLAN_CAPABILITY_ESS))
 			continue;
 
@@ -2180,7 +2180,7 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 	}
 	if (selected)
 		atomic_inc(&selected->users);
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 
 	if (selected) {
 		ieee80211_set_freq(sdata, selected->freq);
@@ -2207,9 +2207,9 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
 		if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
 			ifsta->assoc_scan_tries++;
 			if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
-				ieee80211_sta_start_scan(sdata, NULL, 0);
+				ieee80211_start_scan(sdata, NULL, 0);
 			else
-				ieee80211_sta_start_scan(sdata, ifsta->ssid,
+				ieee80211_start_scan(sdata, ifsta->ssid,
 							 ifsta->ssid_len);
 			ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
 			set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
@@ -2231,7 +2231,7 @@ static void ieee80211_sta_work(struct work_struct *work)
 	if (!netif_running(sdata->dev))
 		return;
 
-	if (local->sta_sw_scanning || local->sta_hw_scanning)
+	if (local->sw_scanning || local->hw_scanning)
 		return;
 
 	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
@@ -2246,7 +2246,8 @@ static void ieee80211_sta_work(struct work_struct *work)
 	    ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
 	    ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
 	    test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
-		ieee80211_sta_start_scan(sdata, ifsta->scan_ssid, ifsta->scan_ssid_len);
+		ieee80211_start_scan(sdata, ifsta->scan_ssid,
+				     ifsta->scan_ssid_len);
 		return;
 	}
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 93f2cda9926..582396a4fdb 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -403,12 +403,12 @@ ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx)
 	struct ieee80211_local *local = rx->local;
 	struct sk_buff *skb = rx->skb;
 
-	if (unlikely(local->sta_hw_scanning))
-		return ieee80211_sta_rx_scan(rx->sdata, skb, rx->status);
+	if (unlikely(local->hw_scanning))
+		return ieee80211_scan_rx(rx->sdata, skb, rx->status);
 
-	if (unlikely(local->sta_sw_scanning)) {
+	if (unlikely(local->sw_scanning)) {
 		/* drop all the other packets during a software scan anyway */
-		if (ieee80211_sta_rx_scan(rx->sdata, skb, rx->status)
+		if (ieee80211_scan_rx(rx->sdata, skb, rx->status)
 		    != RX_QUEUED)
 			dev_kfree_skb(skb);
 		return RX_QUEUED;
@@ -1918,7 +1918,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 		return;
 	}
 
-	if (unlikely(local->sta_sw_scanning || local->sta_hw_scanning))
+	if (unlikely(local->sw_scanning || local->hw_scanning))
 		rx.flags |= IEEE80211_RX_IN_SCAN;
 
 	ieee80211_parse_qos(&rx);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 27727027d76..5e719e7b720 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -32,26 +32,26 @@
 
 void ieee80211_rx_bss_list_init(struct ieee80211_local *local)
 {
-	spin_lock_init(&local->sta_bss_lock);
-	INIT_LIST_HEAD(&local->sta_bss_list);
+	spin_lock_init(&local->bss_lock);
+	INIT_LIST_HEAD(&local->bss_list);
 }
 
 void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local)
 {
-	struct ieee80211_sta_bss *bss, *tmp;
+	struct ieee80211_bss *bss, *tmp;
 
-	list_for_each_entry_safe(bss, tmp, &local->sta_bss_list, list)
+	list_for_each_entry_safe(bss, tmp, &local->bss_list, list)
 		ieee80211_rx_bss_put(local, bss);
 }
 
-struct ieee80211_sta_bss *
+struct ieee80211_bss *
 ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len)
 {
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 
-	spin_lock_bh(&local->sta_bss_lock);
-	bss = local->sta_bss_hash[STA_HASH(bssid)];
+	spin_lock_bh(&local->bss_lock);
+	bss = local->bss_hash[STA_HASH(bssid)];
 	while (bss) {
 		if (!bss_mesh_cfg(bss) &&
 		    !memcmp(bss->bssid, bssid, ETH_ALEN) &&
@@ -63,13 +63,13 @@ ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
 		}
 		bss = bss->hnext;
 	}
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 	return bss;
 }
 
-/* Caller must hold local->sta_bss_lock */
+/* Caller must hold local->bss_lock */
 static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
-					struct ieee80211_sta_bss *bss)
+					struct ieee80211_bss *bss)
 {
 	u8 hash_idx;
 
@@ -79,20 +79,20 @@ static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local,
 	else
 		hash_idx = STA_HASH(bss->bssid);
 
-	bss->hnext = local->sta_bss_hash[hash_idx];
-	local->sta_bss_hash[hash_idx] = bss;
+	bss->hnext = local->bss_hash[hash_idx];
+	local->bss_hash[hash_idx] = bss;
 }
 
-/* Caller must hold local->sta_bss_lock */
+/* Caller must hold local->bss_lock */
 static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
-					struct ieee80211_sta_bss *bss)
+					struct ieee80211_bss *bss)
 {
-	struct ieee80211_sta_bss *b, *prev = NULL;
-	b = local->sta_bss_hash[STA_HASH(bss->bssid)];
+	struct ieee80211_bss *b, *prev = NULL;
+	b = local->bss_hash[STA_HASH(bss->bssid)];
 	while (b) {
 		if (b == bss) {
 			if (!prev)
-				local->sta_bss_hash[STA_HASH(bss->bssid)] =
+				local->bss_hash[STA_HASH(bss->bssid)] =
 					bss->hnext;
 			else
 				prev->hnext = bss->hnext;
@@ -103,11 +103,11 @@ static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local,
 	}
 }
 
-struct ieee80211_sta_bss *
+struct ieee80211_bss *
 ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
 		     u8 *ssid, u8 ssid_len)
 {
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 
 	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
 	if (!bss)
@@ -120,23 +120,23 @@ ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq,
 		bss->ssid_len = ssid_len;
 	}
 
-	spin_lock_bh(&local->sta_bss_lock);
+	spin_lock_bh(&local->bss_lock);
 	/* TODO: order by RSSI? */
-	list_add_tail(&bss->list, &local->sta_bss_list);
+	list_add_tail(&bss->list, &local->bss_list);
 	__ieee80211_rx_bss_hash_add(local, bss);
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 	return bss;
 }
 
 #ifdef CONFIG_MAC80211_MESH
-static struct ieee80211_sta_bss *
+static struct ieee80211_bss *
 ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
 			  u8 *mesh_cfg, int freq)
 {
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 
-	spin_lock_bh(&local->sta_bss_lock);
-	bss = local->sta_bss_hash[mesh_id_hash(mesh_id, mesh_id_len)];
+	spin_lock_bh(&local->bss_lock);
+	bss = local->bss_hash[mesh_id_hash(mesh_id, mesh_id_len)];
 	while (bss) {
 		if (bss_mesh_cfg(bss) &&
 		    !memcmp(bss_mesh_cfg(bss), mesh_cfg, MESH_CFG_CMP_LEN) &&
@@ -149,15 +149,15 @@ ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_i
 		}
 		bss = bss->hnext;
 	}
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 	return bss;
 }
 
-static struct ieee80211_sta_bss *
+static struct ieee80211_bss *
 ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len,
 			  u8 *mesh_cfg, int mesh_config_len, int freq)
 {
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 
 	if (mesh_config_len != MESH_CFG_LEN)
 		return NULL;
@@ -186,16 +186,16 @@ ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_i
 	memcpy(bss->mesh_cfg, mesh_cfg, MESH_CFG_CMP_LEN);
 	bss->mesh_id_len = mesh_id_len;
 	bss->freq = freq;
-	spin_lock_bh(&local->sta_bss_lock);
+	spin_lock_bh(&local->bss_lock);
 	/* TODO: order by RSSI? */
-	list_add_tail(&bss->list, &local->sta_bss_list);
+	list_add_tail(&bss->list, &local->bss_list);
 	__ieee80211_rx_bss_hash_add(local, bss);
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 	return bss;
 }
 #endif
 
-static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
+static void ieee80211_rx_bss_free(struct ieee80211_bss *bss)
 {
 	kfree(bss->ies);
 	kfree(bss_mesh_id(bss));
@@ -204,21 +204,21 @@ static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss)
 }
 
 void ieee80211_rx_bss_put(struct ieee80211_local *local,
-			  struct ieee80211_sta_bss *bss)
+			  struct ieee80211_bss *bss)
 {
 	local_bh_disable();
-	if (!atomic_dec_and_lock(&bss->users, &local->sta_bss_lock)) {
+	if (!atomic_dec_and_lock(&bss->users, &local->bss_lock)) {
 		local_bh_enable();
 		return;
 	}
 
 	__ieee80211_rx_bss_hash_del(local, bss);
 	list_del(&bss->list);
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 	ieee80211_rx_bss_free(bss);
 }
 
-struct ieee80211_sta_bss *
+struct ieee80211_bss *
 ieee80211_bss_info_update(struct ieee80211_local *local,
 			  struct ieee80211_rx_status *rx_status,
 			  struct ieee80211_mgmt *mgmt,
@@ -226,7 +226,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 			  struct ieee802_11_elems *elems,
 			  int freq, bool beacon)
 {
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	int clen;
 
 #ifdef CONFIG_MAC80211_MESH
@@ -252,9 +252,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 	} else {
 #if 0
 		/* TODO: order by RSSI? */
-		spin_lock_bh(&local->sta_bss_lock);
-		list_move_tail(&bss->list, &local->sta_bss_list);
-		spin_unlock_bh(&local->sta_bss_lock);
+		spin_lock_bh(&local->bss_lock);
+		list_move_tail(&bss->list, &local->bss_list);
+		spin_unlock_bh(&local->bss_lock);
 #endif
 	}
 
@@ -327,11 +327,11 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 }
 
 ieee80211_rx_result
-ieee80211_sta_rx_scan(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
-		      struct ieee80211_rx_status *rx_status)
+ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+		  struct ieee80211_rx_status *rx_status)
 {
 	struct ieee80211_mgmt *mgmt;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 	u8 *elements;
 	struct ieee80211_channel *channel;
 	size_t baselen;
@@ -430,7 +430,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	struct ieee80211_sub_if_data *sdata;
 	union iwreq_data wrqu;
 
-	if (WARN_ON(!local->sta_hw_scanning && !local->sta_sw_scanning))
+	if (WARN_ON(!local->hw_scanning && !local->sw_scanning))
 		return;
 
 	local->last_scan_completed = jiffies;
@@ -445,8 +445,8 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	if (sdata)
 		wireless_send_event(sdata->dev, SIOCGIWSCAN, &wrqu, NULL);
 
-	if (local->sta_hw_scanning) {
-		local->sta_hw_scanning = 0;
+	if (local->hw_scanning) {
+		local->hw_scanning = false;
 		if (ieee80211_hw_config(local))
 			printk(KERN_DEBUG "%s: failed to restore operational "
 			       "channel after scan\n", wiphy_name(local->hw.wiphy));
@@ -454,7 +454,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 		goto done;
 	}
 
-	local->sta_sw_scanning = 0;
+	local->sw_scanning = false;
 	if (ieee80211_hw_config(local))
 		printk(KERN_DEBUG "%s: failed to restore operational "
 		       "channel after scan\n", wiphy_name(local->hw.wiphy));
@@ -492,7 +492,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 EXPORT_SYMBOL(ieee80211_scan_completed);
 
 
-void ieee80211_sta_scan_work(struct work_struct *work)
+void ieee80211_scan_work(struct work_struct *work)
 {
 	struct ieee80211_local *local =
 		container_of(work, struct ieee80211_local, scan_work.work);
@@ -589,8 +589,8 @@ void ieee80211_sta_scan_work(struct work_struct *work)
 }
 
 
-int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
-			     u8 *ssid, size_t ssid_len)
+int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
+			 u8 *ssid, size_t ssid_len)
 {
 	struct ieee80211_local *local = scan_sdata->local;
 	struct ieee80211_sub_if_data *sdata;
@@ -615,7 +615,7 @@ int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 	  * ResultCode: SUCCESS, INVALID_PARAMETERS
 	 */
 
-	if (local->sta_sw_scanning || local->sta_hw_scanning) {
+	if (local->sw_scanning || local->hw_scanning) {
 		if (local->scan_sdata == scan_sdata)
 			return 0;
 		return -EBUSY;
@@ -624,17 +624,17 @@ int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 	if (local->ops->hw_scan) {
 		int rc;
 
-		local->sta_hw_scanning = 1;
+		local->hw_scanning = true;
 		rc = local->ops->hw_scan(local_to_hw(local), ssid, ssid_len);
 		if (rc) {
-			local->sta_hw_scanning = 0;
+			local->hw_scanning = false;
 			return rc;
 		}
 		local->scan_sdata = scan_sdata;
 		return 0;
 	}
 
-	local->sta_sw_scanning = 1;
+	local->sw_scanning = true;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
@@ -675,13 +675,14 @@ int ieee80211_sta_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 }
 
 
-int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t ssid_len)
+int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
+			   u8 *ssid, size_t ssid_len)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta;
 
 	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
-		return ieee80211_sta_start_scan(sdata, ssid, ssid_len);
+		return ieee80211_start_scan(sdata, ssid, ssid_len);
 
 	/*
 	 * STA has a state machine that might need to defer scanning
@@ -689,7 +690,7 @@ int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t
 	 * queue it up to the state machine in that case.
 	 */
 
-	if (local->sta_sw_scanning || local->sta_hw_scanning) {
+	if (local->sw_scanning || local->hw_scanning) {
 		if (local->scan_sdata == sdata)
 			return 0;
 		return -EBUSY;
@@ -707,9 +708,9 @@ int ieee80211_sta_req_scan(struct ieee80211_sub_if_data *sdata, u8 *ssid, size_t
 }
 
 
-static void ieee80211_sta_add_scan_ies(struct iw_request_info *info,
-				       struct ieee80211_sta_bss *bss,
-				       char **current_ev, char *end_buf)
+static void ieee80211_scan_add_ies(struct iw_request_info *info,
+				   struct ieee80211_bss *bss,
+				   char **current_ev, char *end_buf)
 {
 	u8 *pos, *end, *next;
 	struct iw_event iwe;
@@ -749,10 +750,10 @@ static void ieee80211_sta_add_scan_ies(struct iw_request_info *info,
 
 
 static char *
-ieee80211_sta_scan_result(struct ieee80211_local *local,
-			  struct iw_request_info *info,
-			  struct ieee80211_sta_bss *bss,
-			  char *current_ev, char *end_buf)
+ieee80211_scan_result(struct ieee80211_local *local,
+		      struct iw_request_info *info,
+		      struct ieee80211_bss *bss,
+		      char *current_ev, char *end_buf)
 {
 	struct iw_event iwe;
 	char *buf;
@@ -828,7 +829,7 @@ ieee80211_sta_scan_result(struct ieee80211_local *local,
 	current_ev = iwe_stream_add_point(info, current_ev, end_buf,
 					  &iwe, "");
 
-	ieee80211_sta_add_scan_ies(info, bss, &current_ev, end_buf);
+	ieee80211_scan_add_ies(info, bss, &current_ev, end_buf);
 
 	if (bss->supp_rates_len > 0) {
 		/* display all supported rates in readable format */
@@ -914,23 +915,23 @@ ieee80211_sta_scan_result(struct ieee80211_local *local,
 }
 
 
-int ieee80211_sta_scan_results(struct ieee80211_local *local,
-			       struct iw_request_info *info,
-			       char *buf, size_t len)
+int ieee80211_scan_results(struct ieee80211_local *local,
+			   struct iw_request_info *info,
+			   char *buf, size_t len)
 {
 	char *current_ev = buf;
 	char *end_buf = buf + len;
-	struct ieee80211_sta_bss *bss;
+	struct ieee80211_bss *bss;
 
-	spin_lock_bh(&local->sta_bss_lock);
-	list_for_each_entry(bss, &local->sta_bss_list, list) {
+	spin_lock_bh(&local->bss_lock);
+	list_for_each_entry(bss, &local->bss_list, list) {
 		if (buf + len - current_ev <= IW_EV_ADDR_LEN) {
-			spin_unlock_bh(&local->sta_bss_lock);
+			spin_unlock_bh(&local->bss_lock);
 			return -E2BIG;
 		}
-		current_ev = ieee80211_sta_scan_result(local, info, bss,
+		current_ev = ieee80211_scan_result(local, info, bss,
 						       current_ev, end_buf);
 	}
-	spin_unlock_bh(&local->sta_bss_lock);
+	spin_unlock_bh(&local->bss_lock);
 	return current_ev - buf;
 }
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1059b17c83b..e606ba08ddd 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -222,7 +222,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 	if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED))
 		return TX_CONTINUE;
 
-	if (unlikely(tx->local->sta_sw_scanning) &&
+	if (unlikely(tx->local->sw_scanning) &&
 	    !ieee80211_is_probe_req(hdr->frame_control))
 		return TX_DROP;
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index cf0b820a0ea..a7968df9dac 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -630,7 +630,7 @@ int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
 		}
 		local->oper_channel = chan;
 
-		if (local->sta_sw_scanning || local->sta_hw_scanning)
+		if (local->sw_scanning || local->hw_scanning)
 			ret = 0;
 		else
 			ret = ieee80211_hw_config(local);
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 97d132811c8..77b68ed8b83 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -552,7 +552,7 @@ static int ieee80211_ioctl_siwscan(struct net_device *dev,
 		ssid_len = req->essid_len;
 	}
 
-	return ieee80211_sta_req_scan(sdata, ssid, ssid_len);
+	return ieee80211_request_scan(sdata, ssid, ssid_len);
 }
 
 
@@ -566,10 +566,10 @@ static int ieee80211_ioctl_giwscan(struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (local->sta_sw_scanning || local->sta_hw_scanning)
+	if (local->sw_scanning || local->hw_scanning)
 		return -EAGAIN;
 
-	res = ieee80211_sta_scan_results(local, info, extra, data->length);
+	res = ieee80211_scan_results(local, info, extra, data->length);
 	if (res >= 0) {
 		data->length = res;
 		return 0;
-- 
cgit v1.2.3


From ccd7b36286f8c42b3fa95c5a8d402162ffab41df Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:56 +0200
Subject: mac80211: clean up some comments

Some comments refer to 80211.o or similar; also remove
a comment about implementing fragments better, we really
have better things to do.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  7 ++-----
 net/mac80211/rx.c          |  4 ++--
 net/mac80211/tx.c          | 15 +--------------
 net/mac80211/wme.h         |  1 -
 4 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 5c38ea08454..1f9336a30e3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -29,9 +29,6 @@
 #include "key.h"
 #include "sta_info.h"
 
-/* ieee80211.o internal definitions, etc. These are not included into
- * low-level drivers. */
-
 struct ieee80211_local;
 
 /* Maximum number of broadcast/multicast frames to buffer when some of the
@@ -293,13 +290,13 @@ struct mesh_config {
 #define IEEE80211_STA_AUTO_BSSID_SEL	BIT(11)
 #define IEEE80211_STA_AUTO_CHANNEL_SEL	BIT(12)
 #define IEEE80211_STA_PRIVACY_INVOKED	BIT(13)
-/* flags for  MLME request*/
+/* flags for MLME request */
 #define IEEE80211_STA_REQ_SCAN 0
 #define IEEE80211_STA_REQ_DIRECT_PROBE 1
 #define IEEE80211_STA_REQ_AUTH 2
 #define IEEE80211_STA_REQ_RUN  3
 
-/* flags used for setting mlme state */
+/* STA/IBSS MLME states */
 enum ieee80211_sta_mlme_state {
 	IEEE80211_STA_MLME_DISABLED,
 	IEEE80211_STA_MLME_DIRECT_PROBE,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 582396a4fdb..33530b29c42 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -501,8 +501,8 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
 	/* Drop disallowed frame classes based on STA auth/assoc state;
 	 * IEEE 802.11, Chap 5.5.
 	 *
-	 * 80211.o does filtering only based on association state, i.e., it
-	 * drops Class 3 frames from not associated stations. hostapd sends
+	 * mac80211 filters only based on association state, i.e. it drops
+	 * Class 3 frames from not associated stations. hostapd sends
 	 * deauth/disassoc frames when needed. In addition, hostapd is
 	 * responsible for filtering on both auth and assoc states.
 	 */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e606ba08ddd..1bed3be01c2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -111,7 +111,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
 	hdr = (struct ieee80211_hdr *)tx->skb->data;
 	if (ieee80211_is_ctl(hdr->frame_control)) {
 		/* TODO: These control frames are not currently sent by
-		 * 80211.o, but should they be implemented, this function
+		 * mac80211, but should they be implemented, this function
 		 * needs to be updated to support duration field calculation.
 		 *
 		 * RTS: time needed to transmit pending data/mgmt frame plus
@@ -1580,19 +1580,6 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 	nh_pos -= skip_header_bytes;
 	h_pos -= skip_header_bytes;
 
-	/* TODO: implement support for fragments so that there is no need to
-	 * reallocate and copy payload; it might be enough to support one
-	 * extra fragment that would be copied in the beginning of the frame
-	 * data.. anyway, it would be nice to include this into skb structure
-	 * somehow
-	 *
-	 * There are few options for this:
-	 * use skb->cb as an extra space for 802.11 header
-	 * allocate new buffer if not enough headroom
-	 * make sure that there is enough headroom in every skb by increasing
-	 * build in headroom in __dev_alloc_skb() (linux/skbuff.h) and
-	 * alloc_skb() (net/core/skbuff.c)
-	 */
 	head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb);
 
 	/*
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index 465e274df7c..bc62f28a4d3 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -1,5 +1,4 @@
 /*
- * IEEE 802.11 driver (80211.o) - QoS datatypes
  * Copyright 2004, Instant802 Networks, Inc.
  * Copyright 2005, Devicescape Software, Inc.
  *
-- 
cgit v1.2.3


From 96dd22ac06b0dbfb069fdf530c72046a941e9694 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:57 +0200
Subject: mac80211: inform driver of basic rateset

Drivers need to know the basic rateset to be able to configure
the ACK/CTS programming in hardware correctly.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  7 ++-----
 net/mac80211/iface.c       |  4 +++-
 net/mac80211/mlme.c        | 40 +++++++++-------------------------------
 net/mac80211/tx.c          |  4 ++--
 net/mac80211/util.c        | 28 ++++++++++++++++++++++++++++
 5 files changed, 44 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 1f9336a30e3..21cc6d07b02 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -433,11 +433,6 @@ struct ieee80211_sub_if_data {
 
 	int drop_unencrypted;
 
-	/*
-	 * basic rates of this AP or the AP we're associated to
-	 */
-	u64 basic_rates;
-
 	/* Fragment table for host-based reassembly */
 	struct ieee80211_fragment_entry	fragments[IEEE80211_FRAGMENT_MAX];
 	unsigned int fragment_next;
@@ -1017,6 +1012,8 @@ void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 void ieee802_11_parse_elems(u8 *start, size_t len,
 			    struct ieee802_11_elems *elems);
 int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freq);
+u64 ieee80211_mandatory_rates(struct ieee80211_local *local,
+			      enum ieee80211_band band);
 
 #ifdef CONFIG_MAC80211_NOINLINE
 #define debug_noinline noinline
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 61b19340488..dab8eba2602 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -144,7 +144,9 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
 	ieee80211_setup_sdata(sdata, type);
 
 	/* reset some values that shouldn't be kept across type changes */
-	sdata->basic_rates = 0;
+	sdata->bss_conf.basic_rates =
+		ieee80211_mandatory_rates(sdata->local,
+			sdata->local->hw.conf.channel->band);
 	sdata->drop_unencrypted = 0;
 
 	return 0;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 5b748447eb7..55bc6071393 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -710,6 +710,12 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	ieee80211_led_assoc(local, 1);
 
 	sdata->bss_conf.assoc = 1;
+	/*
+	 * For now just always ask the driver to update the basic rateset
+	 * when we have associated, we aren't checking whether it actually
+	 * changed or not.
+	 */
+	changed |= BSS_CHANGED_BASIC_RATES;
 	ieee80211_bss_info_change_notify(sdata, changed);
 
 	netif_tx_start_all_queues(sdata->dev);
@@ -1296,7 +1302,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	}
 
 	sta->supp_rates[local->hw.conf.channel->band] = rates;
-	sdata->basic_rates = basic_rates;
+	sdata->bss_conf.basic_rates = basic_rates;
 
 	/* cf. IEEE 802.11 9.2.12 */
 	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
@@ -1453,34 +1459,6 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	return res;
 }
 
-static u64 ieee80211_sta_get_mandatory_rates(struct ieee80211_local *local,
-					enum ieee80211_band band)
-{
-	struct ieee80211_supported_band *sband;
-	struct ieee80211_rate *bitrates;
-	u64 mandatory_rates;
-	enum ieee80211_rate_flags mandatory_flag;
-	int i;
-
-	sband = local->hw.wiphy->bands[band];
-	if (!sband) {
-		WARN_ON(1);
-		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-	}
-
-	if (band == IEEE80211_BAND_2GHZ)
-		mandatory_flag = IEEE80211_RATE_MANDATORY_B;
-	else
-		mandatory_flag = IEEE80211_RATE_MANDATORY_A;
-
-	bitrates = sband->bitrates;
-	mandatory_rates = 0;
-	for (i = 0; i < sband->n_bitrates; i++)
-		if (bitrates[i].flags & mandatory_flag)
-			mandatory_rates |= BIT(i);
-	return mandatory_rates;
-}
-
 static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_mgmt *mgmt,
 				  size_t len,
@@ -1522,7 +1500,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 			prev_rates = sta->supp_rates[band];
 			/* make sure mandatory rates are always added */
 			sta->supp_rates[band] = supp_rates |
-				ieee80211_sta_get_mandatory_rates(local, band);
+				ieee80211_mandatory_rates(local, band);
 
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 			if (sta->supp_rates[band] != prev_rates)
@@ -2361,7 +2339,7 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 
 	/* make sure mandatory rates are always added */
 	sta->supp_rates[band] = supp_rates |
-			ieee80211_sta_get_mandatory_rates(local, band);
+			ieee80211_mandatory_rates(local, band);
 
 	rate_control_rate_init(sta, local);
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1bed3be01c2..a523189f10c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -153,7 +153,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
 		if (r->bitrate > txrate->bitrate)
 			break;
 
-		if (tx->sdata->basic_rates & BIT(i))
+		if (tx->sdata->bss_conf.basic_rates & BIT(i))
 			rate = r->bitrate;
 
 		switch (sband->band) {
@@ -594,7 +594,7 @@ ieee80211_tx_h_misc(struct ieee80211_tx_data *tx)
 		for (idx = 0; idx < sband->n_bitrates; idx++) {
 			if (sband->bitrates[idx].bitrate > rate->bitrate)
 				continue;
-			if (tx->sdata->basic_rates & BIT(idx) &&
+			if (tx->sdata->bss_conf.basic_rates & BIT(idx) &&
 			    (baserate < 0 ||
 			     (sband->bitrates[baserate].bitrate
 			      < sband->bitrates[idx].bitrate)))
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index a7968df9dac..d6aca91e612 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -640,3 +640,31 @@ int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
 
 	return ret;
 }
+
+u64 ieee80211_mandatory_rates(struct ieee80211_local *local,
+			      enum ieee80211_band band)
+{
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *bitrates;
+	u64 mandatory_rates;
+	enum ieee80211_rate_flags mandatory_flag;
+	int i;
+
+	sband = local->hw.wiphy->bands[band];
+	if (!sband) {
+		WARN_ON(1);
+		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	}
+
+	if (band == IEEE80211_BAND_2GHZ)
+		mandatory_flag = IEEE80211_RATE_MANDATORY_B;
+	else
+		mandatory_flag = IEEE80211_RATE_MANDATORY_A;
+
+	bitrates = sband->bitrates;
+	mandatory_rates = 0;
+	for (i = 0; i < sband->n_bitrates; i++)
+		if (bitrates[i].flags & mandatory_flag)
+			mandatory_rates |= BIT(i);
+	return mandatory_rates;
+}
-- 
cgit v1.2.3


From 05c914fe330fa8e1cc67870dc0d3809dfd96c107 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:58 +0200
Subject: mac80211: use nl80211 interface types

There's really no reason for mac80211 to be using its
own interface type defines. Use the nl80211 types and
simplify the configuration code a bit: there's no need
to translate them any more now.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c            | 57 +++++++++++----------------
 net/mac80211/debugfs_netdev.c | 28 +++++++-------
 net/mac80211/ht.c             |  6 +--
 net/mac80211/ieee80211_i.h    |  6 +--
 net/mac80211/iface.c          | 40 ++++++++++---------
 net/mac80211/key.c            |  6 +--
 net/mac80211/main.c           | 89 +++++++++++++++++++++----------------------
 net/mac80211/mlme.c           | 40 +++++++++----------
 net/mac80211/rx.c             | 65 +++++++++++++++----------------
 net/mac80211/scan.c           |  8 ++--
 net/mac80211/sta_info.c       |  4 +-
 net/mac80211/tx.c             | 28 +++++++-------
 net/mac80211/util.c           | 44 +++++++++++----------
 net/mac80211/wext.c           | 84 ++++++++++++++++++++--------------------
 14 files changed, 249 insertions(+), 256 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 6ec2127f9a6..d004351050c 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -24,26 +24,19 @@ struct ieee80211_hw *wiphy_to_hw(struct wiphy *wiphy)
 }
 EXPORT_SYMBOL(wiphy_to_hw);
 
-static enum ieee80211_if_types
-nl80211_type_to_mac80211_type(enum nl80211_iftype type)
+static bool nl80211_type_check(enum nl80211_iftype type)
 {
 	switch (type) {
-	case NL80211_IFTYPE_UNSPECIFIED:
-		return IEEE80211_IF_TYPE_STA;
 	case NL80211_IFTYPE_ADHOC:
-		return IEEE80211_IF_TYPE_IBSS;
 	case NL80211_IFTYPE_STATION:
-		return IEEE80211_IF_TYPE_STA;
 	case NL80211_IFTYPE_MONITOR:
-		return IEEE80211_IF_TYPE_MNTR;
 #ifdef CONFIG_MAC80211_MESH
 	case NL80211_IFTYPE_MESH_POINT:
-		return IEEE80211_IF_TYPE_MESH_POINT;
 #endif
 	case NL80211_IFTYPE_WDS:
-		return IEEE80211_IF_TYPE_WDS;
+		return true;
 	default:
-		return IEEE80211_IF_TYPE_INVALID;
+		return false;
 	}
 }
 
@@ -52,17 +45,15 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
 			       struct vif_params *params)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
-	enum ieee80211_if_types itype;
 	struct net_device *dev;
 	struct ieee80211_sub_if_data *sdata;
 	int err;
 
-	itype = nl80211_type_to_mac80211_type(type);
-	if (itype == IEEE80211_IF_TYPE_INVALID)
+	if (!nl80211_type_check(type))
 		return -EINVAL;
 
-	err = ieee80211_if_add(local, name, &dev, itype, params);
-	if (err || itype != IEEE80211_IF_TYPE_MNTR || !flags)
+	err = ieee80211_if_add(local, name, &dev, type, params);
+	if (err || type != NL80211_IFTYPE_MONITOR || !flags)
 		return err;
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -93,7 +84,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct net_device *dev;
-	enum ieee80211_if_types itype;
 	struct ieee80211_sub_if_data *sdata;
 	int ret;
 
@@ -102,8 +92,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 	if (!dev)
 		return -ENODEV;
 
-	itype = nl80211_type_to_mac80211_type(type);
-	if (itype == IEEE80211_IF_TYPE_INVALID)
+	if (!nl80211_type_check(type))
 		return -EINVAL;
 
 	if (dev == local->mdev)
@@ -111,7 +100,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	ret = ieee80211_if_change_type(sdata, itype);
+	ret = ieee80211_if_change_type(sdata, type);
 	if (ret)
 		return ret;
 
@@ -120,7 +109,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 					    params->mesh_id_len,
 					    params->mesh_id);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_MNTR || !flags)
+	if (sdata->vif.type != NL80211_IFTYPE_MONITOR || !flags)
 		return 0;
 
 	sdata->u.mntr_flags = *flags;
@@ -516,7 +505,7 @@ static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return -EINVAL;
 
 	old = sdata->u.ap.beacon;
@@ -539,7 +528,7 @@ static int ieee80211_set_beacon(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return -EINVAL;
 
 	old = sdata->u.ap.beacon;
@@ -561,7 +550,7 @@ static int ieee80211_del_beacon(struct wiphy *wiphy, struct net_device *dev)
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return -EINVAL;
 
 	old = sdata->u.ap.beacon;
@@ -716,8 +705,8 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 	if (params->vlan) {
 		sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
 
-		if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_AP)
+		if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+		    sdata->vif.type != NL80211_IFTYPE_AP)
 			return -EINVAL;
 	} else
 		sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -747,8 +736,8 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 		return err;
 	}
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+	    sdata->vif.type == NL80211_IFTYPE_AP)
 		ieee80211_send_layer2_update(sta);
 
 	rcu_read_unlock();
@@ -812,8 +801,8 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 	if (params->vlan && params->vlan != sta->sdata->dev) {
 		vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
 
-		if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN &&
-		    vlansdata->vif.type != IEEE80211_IF_TYPE_AP) {
+		if (vlansdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+		    vlansdata->vif.type != NL80211_IFTYPE_AP) {
 			rcu_read_unlock();
 			return -EINVAL;
 		}
@@ -847,7 +836,7 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT)
+	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
 		return -ENOTSUPP;
 
 	rcu_read_lock();
@@ -903,7 +892,7 @@ static int ieee80211_change_mpath(struct wiphy *wiphy,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT)
+	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
 		return -ENOTSUPP;
 
 	rcu_read_lock();
@@ -978,7 +967,7 @@ static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT)
+	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
 		return -ENOTSUPP;
 
 	rcu_read_lock();
@@ -1006,7 +995,7 @@ static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT)
+	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
 		return -ENOTSUPP;
 
 	rcu_read_lock();
@@ -1035,7 +1024,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return -EINVAL;
 
 	if (params->use_cts_prot >= 0) {
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 0fa7681a3d2..1b33cad24ab 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -345,26 +345,26 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
 		return;
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 #ifdef CONFIG_MAC80211_MESH
 		add_mesh_stats(sdata);
 		add_mesh_config(sdata);
 #endif
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
 		add_sta_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP:
 		add_ap_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_WDS:
+	case NL80211_IFTYPE_WDS:
 		add_wds_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		add_monitor_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP_VLAN:
 		add_vlan_files(sdata);
 		break;
 	default:
@@ -482,26 +482,26 @@ static void del_files(struct ieee80211_sub_if_data *sdata)
 		return;
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 #ifdef CONFIG_MAC80211_MESH
 		del_mesh_stats(sdata);
 		del_mesh_config(sdata);
 #endif
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
 		del_sta_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP:
 		del_ap_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_WDS:
+	case NL80211_IFTYPE_WDS:
 		del_wds_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		del_monitor_files(sdata);
 		break;
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP_VLAN:
 		del_vlan_files(sdata);
 		break;
 	default:
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 4dc35c9dabc..bc3c71ad7ae 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -89,7 +89,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, da, ETH_ALEN);
 	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
 		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
@@ -139,7 +139,7 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, da, ETH_ALEN);
 	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
 		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
@@ -185,7 +185,7 @@ static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
 	memset(mgmt, 0, 24);
 	memcpy(mgmt->da, da, ETH_ALEN);
 	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
 		memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 21cc6d07b02..80d88f5ff90 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -954,10 +954,10 @@ void ieee80211_rx_bss_put(struct ieee80211_local *local,
 /* interface handling */
 void ieee80211_if_setup(struct net_device *dev);
 int ieee80211_if_add(struct ieee80211_local *local, const char *name,
-		     struct net_device **new_dev, enum ieee80211_if_types type,
+		     struct net_device **new_dev, enum nl80211_iftype type,
 		     struct vif_params *params);
 int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
-			     enum ieee80211_if_types type);
+			     enum nl80211_iftype type);
 void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
 void ieee80211_remove_interfaces(struct ieee80211_local *local);
 
@@ -1001,7 +1001,7 @@ extern void *mac80211_wiphy_privid; /* for wiphy privid */
 extern const unsigned char rfc1042_header[6];
 extern const unsigned char bridge_tunnel_header[6];
 u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
-			enum ieee80211_if_types type);
+			enum nl80211_iftype type);
 int ieee80211_frame_duration(struct ieee80211_local *local, size_t len,
 			     int rate, int erp, int short_preamble);
 void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index dab8eba2602..004fb23241d 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -41,7 +41,7 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 	sdata->fragment_next = 0;
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP:
 		beacon = sdata->u.ap.beacon;
 		rcu_assign_pointer(sdata->u.ap.beacon, NULL);
 		synchronize_rcu();
@@ -53,22 +53,23 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 		}
 
 		break;
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
 			mesh_rmc_free(sdata);
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
 		kfree(sdata->u.sta.extra_ie);
 		kfree(sdata->u.sta.assocreq_ies);
 		kfree(sdata->u.sta.assocresp_ies);
 		kfree_skb(sdata->u.sta.probe_resp);
 		break;
-	case IEEE80211_IF_TYPE_WDS:
-	case IEEE80211_IF_TYPE_VLAN:
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_MONITOR:
 		break;
-	case IEEE80211_IF_TYPE_INVALID:
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case __NL80211_IFTYPE_AFTER_LAST:
 		BUG();
 		break;
 	}
@@ -81,7 +82,7 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
  * Helper function to initialise an interface to a specific type.
  */
 static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
-				  enum ieee80211_if_types type)
+				  enum nl80211_iftype type)
 {
 	/* clear type-dependent union */
 	memset(&sdata->u, 0, sizeof(sdata->u));
@@ -93,28 +94,29 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	sdata->dev->type = ARPHRD_ETHER;
 
 	switch (type) {
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP:
 		skb_queue_head_init(&sdata->u.ap.ps_bc_buf);
 		INIT_LIST_HEAD(&sdata->u.ap.vlans);
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
 		ieee80211_sta_setup_sdata(sdata);
 		break;
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif))
 			ieee80211_mesh_init_sdata(sdata);
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
 		sdata->dev->hard_start_xmit = ieee80211_monitor_start_xmit;
 		sdata->u.mntr_flags = MONITOR_FLAG_CONTROL |
 				      MONITOR_FLAG_OTHER_BSS;
 		break;
-	case IEEE80211_IF_TYPE_WDS:
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_AP_VLAN:
 		break;
-	case IEEE80211_IF_TYPE_INVALID:
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case __NL80211_IFTYPE_AFTER_LAST:
 		BUG();
 		break;
 	}
@@ -123,7 +125,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 }
 
 int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
-			     enum ieee80211_if_types type)
+			     enum nl80211_iftype type)
 {
 	ASSERT_RTNL();
 
@@ -153,7 +155,7 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
 }
 
 int ieee80211_if_add(struct ieee80211_local *local, const char *name,
-		     struct net_device **new_dev, enum ieee80211_if_types type,
+		     struct net_device **new_dev, enum nl80211_iftype type,
 		     struct vif_params *params)
 {
 	struct net_device *ndev;
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 6597c779e35..d5b95748db2 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -118,8 +118,8 @@ static const u8 *get_mac_for_key(struct ieee80211_key *key)
 	 * address to indicate a transmit-only key.
 	 */
 	if (key->conf.alg != ALG_WEP &&
-	    (key->sdata->vif.type == IEEE80211_IF_TYPE_AP ||
-	     key->sdata->vif.type == IEEE80211_IF_TYPE_VLAN))
+	    (key->sdata->vif.type == NL80211_IFTYPE_AP ||
+	     key->sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
 		addr = zero_addr;
 
 	if (key->sta)
@@ -331,7 +331,7 @@ void ieee80211_key_link(struct ieee80211_key *key,
 		 */
 		key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
 	} else {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
 			struct sta_info *ap;
 
 			/*
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4c424acc01a..584a75bd6cf 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -146,7 +146,7 @@ static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
 	int meshhdrlen;
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	meshhdrlen = (sdata->vif.type == IEEE80211_IF_TYPE_MESH_POINT) ? 5 : 0;
+	meshhdrlen = (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ? 5 : 0;
 
 	/* FIX: what would be proper limits for MTU?
 	 * This interface uses 802.3 frames. */
@@ -164,18 +164,16 @@ static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
 
 static inline int identical_mac_addr_allowed(int type1, int type2)
 {
-	return (type1 == IEEE80211_IF_TYPE_MNTR ||
-		type2 == IEEE80211_IF_TYPE_MNTR ||
-		(type1 == IEEE80211_IF_TYPE_AP &&
-		 type2 == IEEE80211_IF_TYPE_WDS) ||
-		(type1 == IEEE80211_IF_TYPE_WDS &&
-		 (type2 == IEEE80211_IF_TYPE_WDS ||
-		  type2 == IEEE80211_IF_TYPE_AP)) ||
-		(type1 == IEEE80211_IF_TYPE_AP &&
-		 type2 == IEEE80211_IF_TYPE_VLAN) ||
-		(type1 == IEEE80211_IF_TYPE_VLAN &&
-		 (type2 == IEEE80211_IF_TYPE_AP ||
-		  type2 == IEEE80211_IF_TYPE_VLAN)));
+	return type1 == NL80211_IFTYPE_MONITOR ||
+		type2 == NL80211_IFTYPE_MONITOR ||
+		(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) ||
+		(type1 == NL80211_IFTYPE_WDS &&
+			(type2 == NL80211_IFTYPE_WDS ||
+			 type2 == NL80211_IFTYPE_AP)) ||
+		(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) ||
+		(type1 == NL80211_IFTYPE_AP_VLAN &&
+			(type2 == NL80211_IFTYPE_AP ||
+			 type2 == NL80211_IFTYPE_AP_VLAN));
 }
 
 static int ieee80211_open(struct net_device *dev)
@@ -211,8 +209,8 @@ static int ieee80211_open(struct net_device *dev)
 			 * belonging to the same hardware. Then, however, we're
 			 * faced with having to adopt two different TSF timers...
 			 */
-			if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
-			    nsdata->vif.type == IEEE80211_IF_TYPE_IBSS)
+			if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+			    nsdata->vif.type == NL80211_IFTYPE_ADHOC)
 				return -EBUSY;
 
 			/*
@@ -232,37 +230,38 @@ static int ieee80211_open(struct net_device *dev)
 			/*
 			 * can only add VLANs to enabled APs
 			 */
-			if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN &&
-			    nsdata->vif.type == IEEE80211_IF_TYPE_AP)
+			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+			    nsdata->vif.type == NL80211_IFTYPE_AP)
 				sdata->bss = &nsdata->u.ap;
 		}
 	}
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_WDS:
+	case NL80211_IFTYPE_WDS:
 		if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
 			return -ENOLINK;
 		break;
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP_VLAN:
 		if (!sdata->bss)
 			return -ENOLINK;
 		list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
 		break;
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP:
 		sdata->bss = &sdata->u.ap;
 		break;
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 		if (!ieee80211_vif_is_mesh(&sdata->vif))
 			break;
 		/* mesh ifaces must set allmulti to forward mcast traffic */
 		atomic_inc(&local->iff_allmultis);
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_MNTR:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_MONITOR:
+	case NL80211_IFTYPE_ADHOC:
 		/* no special treatment */
 		break;
-	case IEEE80211_IF_TYPE_INVALID:
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case __NL80211_IFTYPE_AFTER_LAST:
 		/* cannot happen */
 		WARN_ON(1);
 		break;
@@ -309,10 +308,10 @@ static int ieee80211_open(struct net_device *dev)
 	}
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP_VLAN:
 		/* no need to tell driver */
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
 			local->cooked_mntrs++;
 			break;
@@ -336,8 +335,8 @@ static int ieee80211_open(struct net_device *dev)
 		ieee80211_configure_filter(local);
 		netif_addr_unlock_bh(local->mdev);
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
 		sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
 		/* fall through */
 	default:
@@ -354,14 +353,14 @@ static int ieee80211_open(struct net_device *dev)
 		ieee80211_bss_info_change_notify(sdata, changed);
 		ieee80211_enable_keys(sdata);
 
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA &&
+		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
 		    !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME))
 			netif_carrier_off(dev);
 		else
 			netif_carrier_on(dev);
 	}
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) {
+	if (sdata->vif.type == NL80211_IFTYPE_WDS) {
 		/* Create STA entry for the WDS peer */
 		sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
 				     GFP_KERNEL);
@@ -417,8 +416,8 @@ static int ieee80211_open(struct net_device *dev)
 	 * yet be effective. Trigger execution of ieee80211_sta_work
 	 * to fix this.
 	 */
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 		queue_work(local->hw.workqueue, &ifsta->work);
 	}
@@ -433,7 +432,7 @@ static int ieee80211_open(struct net_device *dev)
 		local->ops->stop(local_to_hw(local));
  err_del_bss:
 	sdata->bss = NULL;
-	if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN)
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		list_del(&sdata->u.vlan.list);
 	return res;
 }
@@ -496,7 +495,7 @@ static int ieee80211_stop(struct net_device *dev)
 	dev_mc_unsync(local->mdev, dev);
 
 	/* APs need special treatment */
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP) {
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
 		struct ieee80211_sub_if_data *vlan, *tmp;
 		struct beacon_data *old_beacon = sdata->u.ap.beacon;
 
@@ -515,11 +514,11 @@ static int ieee80211_stop(struct net_device *dev)
 	local->open_count--;
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP_VLAN:
 		list_del(&sdata->u.vlan.list);
 		/* no need to tell driver */
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
 			local->cooked_mntrs--;
 			break;
@@ -542,8 +541,8 @@ static int ieee80211_stop(struct net_device *dev)
 		ieee80211_configure_filter(local);
 		netif_addr_unlock_bh(local->mdev);
 		break;
-	case IEEE80211_IF_TYPE_STA:
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
 		sdata->u.sta.state = IEEE80211_STA_MLME_DISABLED;
 		memset(sdata->u.sta.bssid, 0, ETH_ALEN);
 		del_timer_sync(&sdata->u.sta.timer);
@@ -569,7 +568,7 @@ static int ieee80211_stop(struct net_device *dev)
 		sdata->u.sta.extra_ie = NULL;
 		sdata->u.sta.extra_ie_len = 0;
 		/* fall through */
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif)) {
 			/* allmulti is always set on mesh ifaces */
 			atomic_dec(&local->iff_allmultis);
@@ -698,12 +697,12 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
 	memset(&conf, 0, sizeof(conf));
 	conf.changed = changed;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		conf.bssid = sdata->u.sta.bssid;
 		conf.ssid = sdata->u.sta.ssid;
 		conf.ssid_len = sdata->u.sta.ssid_len;
-	} else if (sdata->vif.type == IEEE80211_IF_TYPE_AP) {
+	} else if (sdata->vif.type == NL80211_IFTYPE_AP) {
 		conf.bssid = sdata->dev->dev_addr;
 		conf.ssid = sdata->u.ap.ssid;
 		conf.ssid_len = sdata->u.ap.ssid_len;
@@ -1204,7 +1203,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_MNTR) {
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
 			if (!netif_running(sdata->dev))
 				continue;
 
@@ -1450,7 +1449,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	/* add one default STA interface */
 	result = ieee80211_if_add(local, "wlan%d", NULL,
-				  IEEE80211_IF_TYPE_STA, NULL);
+				  NL80211_IFTYPE_STATION, NULL);
 	if (result)
 		printk(KERN_WARNING "%s: Failed to add default virtual iface\n",
 		       wiphy_name(local->hw.wiphy));
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 55bc6071393..8a2cfd3609b 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -678,7 +678,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 
 	ifsta->flags |= IEEE80211_STA_ASSOCIATED;
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return;
 
 	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
@@ -1002,17 +1002,17 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	DECLARE_MAC_BUF(mac);
 
 	if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC)
 		return;
 
 	if (len < 24 + 6)
 		return;
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+	if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
 	    memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0)
 		return;
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+	if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
 	    memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
 		return;
 
@@ -1020,7 +1020,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
 	status_code = le16_to_cpu(mgmt->u.auth.status_code);
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		/*
 		 * IEEE 802.11 standard does not require authentication in IBSS
 		 * networks and most implementations do not seem to use it.
@@ -1487,7 +1487,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
 		return;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems->supp_rates &&
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
 	    memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
 		supp_rates = ieee80211_sta_get_rates(local, elems, band);
 
@@ -1532,14 +1532,14 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	 * In STA mode, the remaining parameters should not be overridden
 	 * by beacons because they're not necessarily accurate there.
 	 */
-	if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+	if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
 	    bss->last_probe_resp && beacon) {
 		ieee80211_rx_bss_put(local, bss);
 		return;
 	}
 
 	/* check if we need to merge IBSS */
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && beacon &&
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC && beacon &&
 	    bss->capability & WLAN_CAPABILITY_IBSS &&
 	    bss->freq == local->oper_channel->center_freq &&
 	    elems->ssid_len == sdata->u.sta.ssid_len &&
@@ -1649,7 +1649,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return;
 	ifsta = &sdata->u.sta;
 
@@ -1700,7 +1700,7 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 	DECLARE_MAC_BUF(mac3);
 #endif
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS ||
+	if (sdata->vif.type != NL80211_IFTYPE_ADHOC ||
 	    ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED ||
 	    len < 24 + 2 || !ifsta->probe_resp)
 		return;
@@ -2212,8 +2212,8 @@ static void ieee80211_sta_work(struct work_struct *work)
 	if (local->sw_scanning || local->hw_scanning)
 		return;
 
-	if (WARN_ON(sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-		    sdata->vif.type != IEEE80211_IF_TYPE_IBSS))
+	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION &&
+		    sdata->vif.type != NL80211_IFTYPE_ADHOC))
 		return;
 	ifsta = &sdata->u.sta;
 
@@ -2273,7 +2273,7 @@ static void ieee80211_sta_work(struct work_struct *work)
 
 static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
 {
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type == NL80211_IFTYPE_STATION)
 		queue_work(sdata->local->hw.workqueue,
 			   &sdata->u.sta.work);
 }
@@ -2355,7 +2355,7 @@ void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return;
 
 	if ((ifsta->flags & (IEEE80211_STA_BSSID_SET |
@@ -2407,7 +2407,7 @@ int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size
 	else
 		ifsta->flags &= ~IEEE80211_STA_SSID_SET;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
 	    !(ifsta->flags & IEEE80211_STA_BSSID_SET)) {
 		ifsta->ibss_join_req = jiffies;
 		ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
@@ -2482,8 +2482,8 @@ int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason
 	printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n",
 	       sdata->dev->name, reason);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC)
 		return -EINVAL;
 
 	ieee80211_set_disassoc(sdata, ifsta, true, true, reason);
@@ -2497,7 +2497,7 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
 	printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n",
 	       sdata->dev->name, reason);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return -EINVAL;
 
 	if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED))
@@ -2513,7 +2513,7 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
 	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
 	struct ieee80211_if_sta *ifsta;
 
-	if (sdata && sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		ifsta = &sdata->u.sta;
 		if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) ||
 		    (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) &&
@@ -2539,7 +2539,7 @@ void ieee80211_notify_mac(struct ieee80211_hw *hw,
 	case IEEE80211_NOTIFY_RE_ASSOC:
 		rcu_read_lock();
 		list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-			if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+			if (sdata->vif.type != NL80211_IFTYPE_STATION)
 				continue;
 
 			ieee80211_sta_req_auth(sdata, &sdata->u.sta);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 33530b29c42..8c3dda5f00b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -295,7 +295,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 		if (!netif_running(sdata->dev))
 			continue;
 
-		if (sdata->vif.type != IEEE80211_IF_TYPE_MNTR)
+		if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
 			continue;
 
 		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)
@@ -512,7 +512,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
 
 	if (unlikely((ieee80211_is_data(hdr->frame_control) ||
 		      ieee80211_is_pspoll(hdr->frame_control)) &&
-		     rx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+		     rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
 		     (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) {
 		if ((!ieee80211_has_fromds(hdr->frame_control) &&
 		     !ieee80211_has_tods(hdr->frame_control) &&
@@ -724,14 +724,14 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 	/* Update last_rx only for IBSS packets which are for the current
 	 * BSSID to avoid keeping the current IBSS network alive in cases where
 	 * other STAs are using different BSSID. */
-	if (rx->sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
-						IEEE80211_IF_TYPE_IBSS);
+						NL80211_IFTYPE_ADHOC);
 		if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0)
 			sta->last_rx = jiffies;
 	} else
 	if (!is_multicast_ether_addr(hdr->addr1) ||
-	    rx->sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+	    rx->sdata->vif.type == NL80211_IFTYPE_STATION) {
 		/* Update last_rx only for unicast frames in order to prevent
 		 * the Probe Request frames (the only broadcast frames from a
 		 * STA in infrastructure mode) from keeping a connection alive.
@@ -751,8 +751,8 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 	sta->last_noise = rx->status->noise;
 
 	if (!ieee80211_has_morefrags(hdr->frame_control) &&
-	    (rx->sdata->vif.type == IEEE80211_IF_TYPE_AP ||
-	     rx->sdata->vif.type == IEEE80211_IF_TYPE_VLAN)) {
+	    (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
+	     rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) {
 		/* Change STA power saving mode only in the end of a frame
 		 * exchange sequence */
 		if (test_sta_flags(sta, WLAN_STA_PS) &&
@@ -982,8 +982,8 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
 		   !(rx->flags & IEEE80211_RX_RA_MATCH)))
 		return RX_CONTINUE;
 
-	if ((sdata->vif.type != IEEE80211_IF_TYPE_AP) &&
-	    (sdata->vif.type != IEEE80211_IF_TYPE_VLAN))
+	if ((sdata->vif.type != NL80211_IFTYPE_AP) &&
+	    (sdata->vif.type != NL80211_IFTYPE_AP_VLAN))
 		return RX_DROP_UNUSABLE;
 
 	skb = skb_dequeue(&rx->sta->tx_filtered);
@@ -1131,23 +1131,23 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 	switch (hdr->frame_control &
 		cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
 	case __constant_cpu_to_le16(IEEE80211_FCTL_TODS):
-		if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_AP &&
-			     sdata->vif.type != IEEE80211_IF_TYPE_VLAN))
+		if (unlikely(sdata->vif.type != NL80211_IFTYPE_AP &&
+			     sdata->vif.type != NL80211_IFTYPE_AP_VLAN))
 			return -1;
 		break;
 	case __constant_cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
-		if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_WDS &&
-			     sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT))
+		if (unlikely(sdata->vif.type != NL80211_IFTYPE_WDS &&
+			     sdata->vif.type != NL80211_IFTYPE_MESH_POINT))
 			return -1;
 		break;
 	case __constant_cpu_to_le16(IEEE80211_FCTL_FROMDS):
-		if (sdata->vif.type != IEEE80211_IF_TYPE_STA ||
+		if (sdata->vif.type != NL80211_IFTYPE_STATION ||
 		    (is_multicast_ether_addr(dst) &&
 		     !compare_ether_addr(src, dev->dev_addr)))
 			return -1;
 		break;
 	case __constant_cpu_to_le16(0):
-		if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+		if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
 			return -1;
 		break;
 	}
@@ -1221,8 +1221,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 	skb = rx->skb;
 	xmit_skb = NULL;
 
-	if ((sdata->vif.type == IEEE80211_IF_TYPE_AP ||
-	     sdata->vif.type == IEEE80211_IF_TYPE_VLAN) &&
+	if ((sdata->vif.type == NL80211_IFTYPE_AP ||
+	     sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
 	    !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
 	    (rx->flags & IEEE80211_RX_RA_MATCH)) {
 		if (is_multicast_ether_addr(ehdr->h_dest)) {
@@ -1536,8 +1536,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 	 * FIXME: revisit this, I'm sure we should handle most
 	 *	  of these frames in other modes as well!
 	 */
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC)
 		return RX_DROP_MONITOR;
 
 	switch (mgmt->u.action.category) {
@@ -1595,8 +1595,8 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 	if (ieee80211_vif_is_mesh(&sdata->vif))
 		return ieee80211_mesh_rx_mgmt(sdata, rx->skb, rx->status);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC)
 		return RX_DROP_MONITOR;
 
 	if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
@@ -1632,7 +1632,7 @@ static void ieee80211_rx_michael_mic_report(struct net_device *dev,
 	if (!ieee80211_has_protected(hdr->frame_control))
 		goto ignore;
 
-	if (rx->sdata->vif.type == IEEE80211_IF_TYPE_AP && keyidx) {
+	if (rx->sdata->vif.type == NL80211_IFTYPE_AP && keyidx) {
 		/*
 		 * APs with pairwise keys should never receive Michael MIC
 		 * errors for non-zero keyidx because these are reserved for
@@ -1702,7 +1702,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx)
 		if (!netif_running(sdata->dev))
 			continue;
 
-		if (sdata->vif.type != IEEE80211_IF_TYPE_MNTR ||
+		if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
 		    !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
 			continue;
 
@@ -1801,7 +1801,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 	int multicast = is_multicast_ether_addr(hdr->addr1);
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_STA:
+	case NL80211_IFTYPE_STATION:
 		if (!bssid)
 			return 0;
 		if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) {
@@ -1816,7 +1816,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 			rx->flags &= ~IEEE80211_RX_RA_MATCH;
 		}
 		break;
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_ADHOC:
 		if (!bssid)
 			return 0;
 		if (ieee80211_is_beacon(hdr->frame_control)) {
@@ -1837,7 +1837,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 						bssid, hdr->addr2,
 						BIT(rx->status->rate_idx));
 		break;
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 		if (!multicast &&
 		    compare_ether_addr(sdata->dev->dev_addr,
 				       hdr->addr1) != 0) {
@@ -1847,8 +1847,8 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 			rx->flags &= ~IEEE80211_RX_RA_MATCH;
 		}
 		break;
-	case IEEE80211_IF_TYPE_VLAN:
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_AP:
 		if (!bssid) {
 			if (compare_ether_addr(sdata->dev->dev_addr,
 					       hdr->addr1))
@@ -1860,16 +1860,17 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 			rx->flags &= ~IEEE80211_RX_RA_MATCH;
 		}
 		break;
-	case IEEE80211_IF_TYPE_WDS:
+	case NL80211_IFTYPE_WDS:
 		if (bssid || !ieee80211_is_data(hdr->frame_control))
 			return 0;
 		if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2))
 			return 0;
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		/* take everything */
 		break;
-	case IEEE80211_IF_TYPE_INVALID:
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case __NL80211_IFTYPE_AFTER_LAST:
 		/* should never get here */
 		WARN_ON(1);
 		break;
@@ -1930,7 +1931,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 		if (!netif_running(sdata->dev))
 			continue;
 
-		if (sdata->vif.type == IEEE80211_IF_TYPE_MNTR)
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
 			continue;
 
 		bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 5e719e7b720..8e6685e7ae8 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -475,7 +475,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw)
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		/* Tell AP we're back */
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
 			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
 				ieee80211_send_nullfunc(local, sdata, 0);
 				netif_tx_wake_all_queues(sdata->dev);
@@ -539,7 +539,7 @@ void ieee80211_scan_work(struct work_struct *work)
 		chan = &sband->channels[local->scan_channel_idx];
 
 		if (chan->flags & IEEE80211_CHAN_DISABLED ||
-		    (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+		    (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
 		     chan->flags & IEEE80211_CHAN_NO_IBSS))
 			skip = 1;
 
@@ -638,7 +638,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
 			if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
 				netif_tx_stop_all_queues(sdata->dev);
 				ieee80211_send_nullfunc(local, sdata, 1);
@@ -681,7 +681,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_sta *ifsta;
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return ieee80211_start_scan(sdata, ssid, ssid_len);
 
 	/*
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 3370b262563..31246d8e532 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -319,7 +319,7 @@ int sta_info_insert(struct sta_info *sta)
 
 	/* notify driver */
 	if (local->ops->sta_notify) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN)
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			sdata = container_of(sdata->bss,
 					     struct ieee80211_sub_if_data,
 					     u.ap);
@@ -456,7 +456,7 @@ static void __sta_info_unlink(struct sta_info **sta)
 	local->num_sta--;
 
 	if (local->ops->sta_notify) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN)
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			sdata = container_of(sdata->bss,
 					     struct ieee80211_sub_if_data,
 					     u.ap);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a523189f10c..f4bcc589d67 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -226,7 +226,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 	    !ieee80211_is_probe_req(hdr->frame_control))
 		return TX_DROP;
 
-	if (tx->sdata->vif.type == IEEE80211_IF_TYPE_MESH_POINT)
+	if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
 		return TX_CONTINUE;
 
 	if (tx->flags & IEEE80211_TX_PS_BUFFERED)
@@ -236,7 +236,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 
 	if (likely(tx->flags & IEEE80211_TX_UNICAST)) {
 		if (unlikely(!(sta_flags & WLAN_STA_ASSOC) &&
-			     tx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
+			     tx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
 			     ieee80211_is_data(hdr->frame_control))) {
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 			DECLARE_MAC_BUF(mac);
@@ -250,7 +250,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 	} else {
 		if (unlikely(ieee80211_is_data(hdr->frame_control) &&
 			     tx->local->num_sta == 0 &&
-			     tx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS)) {
+			     tx->sdata->vif.type != NL80211_IFTYPE_ADHOC)) {
 			/*
 			 * No associated STAs - no need to send multicast
 			 * frames.
@@ -281,7 +281,7 @@ static void purge_old_ps_buffers(struct ieee80211_local *local)
 
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		struct ieee80211_if_ap *ap;
-		if (sdata->vif.type != IEEE80211_IF_TYPE_AP)
+		if (sdata->vif.type != NL80211_IFTYPE_AP)
 			continue;
 		ap = &sdata->u.ap;
 		skb = skb_dequeue(&ap->ps_bc_buf);
@@ -979,7 +979,7 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 
 	/* process and remove the injection radiotap header */
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (unlikely(sdata->vif.type == IEEE80211_IF_TYPE_MNTR)) {
+	if (unlikely(sdata->vif.type == NL80211_IFTYPE_MONITOR)) {
 		if (__ieee80211_parse_tx_radiotap(tx, skb) == TX_DROP)
 			return TX_DROP;
 
@@ -1457,8 +1457,8 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
 
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_AP:
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
 		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
 		/* DA BSSID SA */
 		memcpy(hdr.addr1, skb->data, ETH_ALEN);
@@ -1466,7 +1466,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 		memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
 		hdrlen = 24;
 		break;
-	case IEEE80211_IF_TYPE_WDS:
+	case NL80211_IFTYPE_WDS:
 		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
 		/* RA TA DA SA */
 		memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN);
@@ -1476,7 +1476,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 		hdrlen = 30;
 		break;
 #ifdef CONFIG_MAC80211_MESH
-	case IEEE80211_IF_TYPE_MESH_POINT:
+	case NL80211_IFTYPE_MESH_POINT:
 		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
 		/* RA TA DA SA */
 		memset(hdr.addr1, 0, ETH_ALEN);
@@ -1493,7 +1493,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 		hdrlen = 30;
 		break;
 #endif
-	case IEEE80211_IF_TYPE_STA:
+	case NL80211_IFTYPE_STATION:
 		fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
 		/* BSSID SA DA */
 		memcpy(hdr.addr1, sdata->u.sta.bssid, ETH_ALEN);
@@ -1501,7 +1501,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 		memcpy(hdr.addr3, skb->data, ETH_ALEN);
 		hdrlen = 24;
 		break;
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_ADHOC:
 		/* DA SA BSSID */
 		memcpy(hdr.addr1, skb->data, ETH_ALEN);
 		memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
@@ -1812,7 +1812,7 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	sdata = vif_to_sdata(vif);
 	bdev = sdata->dev;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP) {
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
 		ap = &sdata->u.ap;
 		beacon = rcu_dereference(ap->beacon);
 		if (ap && beacon) {
@@ -1854,7 +1854,7 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 			num_beacons = &ap->num_beacons;
 		} else
 			goto out;
-	} else if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		struct ieee80211_hdr *hdr;
 		ifsta = &sdata->u.sta;
 
@@ -1999,7 +1999,7 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw,
 	rcu_read_lock();
 	beacon = rcu_dereference(bss->beacon);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_AP || !beacon || !beacon->head)
+	if (sdata->vif.type != NL80211_IFTYPE_AP || !beacon || !beacon->head)
 		goto out;
 
 	if (bss->dtim_count != 0)
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index d6aca91e612..6eb222369bc 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -43,7 +43,7 @@ const unsigned char bridge_tunnel_header[] __aligned(2) =
 
 
 u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
-			enum ieee80211_if_types type)
+			enum nl80211_iftype type)
 {
 	__le16 fc = hdr->frame_control;
 
@@ -77,10 +77,10 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
 
 		if (ieee80211_is_back_req(fc)) {
 			switch (type) {
-			case IEEE80211_IF_TYPE_STA:
+			case NL80211_IFTYPE_STATION:
 				return hdr->addr2;
-			case IEEE80211_IF_TYPE_AP:
-			case IEEE80211_IF_TYPE_VLAN:
+			case NL80211_IFTYPE_AP:
+			case NL80211_IFTYPE_AP_VLAN:
 				return hdr->addr1;
 			default:
 				break; /* fall through to the return */
@@ -376,15 +376,16 @@ void ieee80211_iterate_active_interfaces(
 
 	list_for_each_entry(sdata, &local->interfaces, list) {
 		switch (sdata->vif.type) {
-		case IEEE80211_IF_TYPE_INVALID:
-		case IEEE80211_IF_TYPE_MNTR:
-		case IEEE80211_IF_TYPE_VLAN:
+		case __NL80211_IFTYPE_AFTER_LAST:
+		case NL80211_IFTYPE_UNSPECIFIED:
+		case NL80211_IFTYPE_MONITOR:
+		case NL80211_IFTYPE_AP_VLAN:
 			continue;
-		case IEEE80211_IF_TYPE_AP:
-		case IEEE80211_IF_TYPE_STA:
-		case IEEE80211_IF_TYPE_IBSS:
-		case IEEE80211_IF_TYPE_WDS:
-		case IEEE80211_IF_TYPE_MESH_POINT:
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_WDS:
+		case NL80211_IFTYPE_MESH_POINT:
 			break;
 		}
 		if (netif_running(sdata->dev))
@@ -409,15 +410,16 @@ void ieee80211_iterate_active_interfaces_atomic(
 
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		switch (sdata->vif.type) {
-		case IEEE80211_IF_TYPE_INVALID:
-		case IEEE80211_IF_TYPE_MNTR:
-		case IEEE80211_IF_TYPE_VLAN:
+		case __NL80211_IFTYPE_AFTER_LAST:
+		case NL80211_IFTYPE_UNSPECIFIED:
+		case NL80211_IFTYPE_MONITOR:
+		case NL80211_IFTYPE_AP_VLAN:
 			continue;
-		case IEEE80211_IF_TYPE_AP:
-		case IEEE80211_IF_TYPE_STA:
-		case IEEE80211_IF_TYPE_IBSS:
-		case IEEE80211_IF_TYPE_WDS:
-		case IEEE80211_IF_TYPE_MESH_POINT:
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_WDS:
+		case NL80211_IFTYPE_MESH_POINT:
 			break;
 		}
 		if (netif_running(sdata->dev))
@@ -622,7 +624,7 @@ int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
 	chan = ieee80211_get_channel(local->hw.wiphy, freqMHz);
 
 	if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) {
-		if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+		if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
 		    chan->flags & IEEE80211_CHAN_NO_IBSS) {
 			printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
 				"%d MHz\n", sdata->dev->name, chan->center_freq);
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 77b68ed8b83..aef9707700f 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -122,8 +122,8 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev,
 	if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
 		return -EOPNOTSUPP;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length);
 		if (ret)
 			return ret;
@@ -273,21 +273,21 @@ static int ieee80211_ioctl_siwmode(struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	int type;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN)
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		return -EOPNOTSUPP;
 
 	switch (*mode) {
 	case IW_MODE_INFRA:
-		type = IEEE80211_IF_TYPE_STA;
+		type = NL80211_IFTYPE_STATION;
 		break;
 	case IW_MODE_ADHOC:
-		type = IEEE80211_IF_TYPE_IBSS;
+		type = NL80211_IFTYPE_ADHOC;
 		break;
 	case IW_MODE_REPEAT:
-		type = IEEE80211_IF_TYPE_WDS;
+		type = NL80211_IFTYPE_WDS;
 		break;
 	case IW_MODE_MONITOR:
-		type = IEEE80211_IF_TYPE_MNTR;
+		type = NL80211_IFTYPE_MONITOR;
 		break;
 	default:
 		return -EINVAL;
@@ -305,22 +305,22 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	switch (sdata->vif.type) {
-	case IEEE80211_IF_TYPE_AP:
+	case NL80211_IFTYPE_AP:
 		*mode = IW_MODE_MASTER;
 		break;
-	case IEEE80211_IF_TYPE_STA:
+	case NL80211_IFTYPE_STATION:
 		*mode = IW_MODE_INFRA;
 		break;
-	case IEEE80211_IF_TYPE_IBSS:
+	case NL80211_IFTYPE_ADHOC:
 		*mode = IW_MODE_ADHOC;
 		break;
-	case IEEE80211_IF_TYPE_MNTR:
+	case NL80211_IFTYPE_MONITOR:
 		*mode = IW_MODE_MONITOR;
 		break;
-	case IEEE80211_IF_TYPE_WDS:
+	case NL80211_IFTYPE_WDS:
 		*mode = IW_MODE_REPEAT;
 		break;
-	case IEEE80211_IF_TYPE_VLAN:
+	case NL80211_IFTYPE_AP_VLAN:
 		*mode = IW_MODE_SECOND;		/* FIXME */
 		break;
 	default:
@@ -336,13 +336,13 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev,
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type == NL80211_IFTYPE_STATION)
 		sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL;
 
 	/* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */
 	if (freq->e == 0) {
 		if (freq->m < 0) {
-			if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
+			if (sdata->vif.type == NL80211_IFTYPE_STATION)
 				sdata->u.sta.flags |=
 					IEEE80211_STA_AUTO_CHANNEL_SEL;
 			return 0;
@@ -386,8 +386,8 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev,
 		len--;
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		int ret;
 		if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
 			if (len > IEEE80211_MAX_SSID_LEN)
@@ -407,7 +407,7 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev,
 		return 0;
 	}
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP) {
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
 		memcpy(sdata->u.ap.ssid, ssid, len);
 		memset(sdata->u.ap.ssid + len, 0,
 		       IEEE80211_MAX_SSID_LEN - len);
@@ -426,8 +426,8 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
 
 	struct ieee80211_sub_if_data *sdata;
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		int res = ieee80211_sta_get_ssid(sdata, ssid, &len);
 		if (res == 0) {
 			data->length = len;
@@ -437,7 +437,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
 		return res;
 	}
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_AP) {
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
 		len = sdata->u.ap.ssid_len;
 		if (len > IW_ESSID_MAX_SIZE)
 			len = IW_ESSID_MAX_SIZE;
@@ -457,8 +457,8 @@ static int ieee80211_ioctl_siwap(struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata;
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		int ret;
 		if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
 			memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data,
@@ -477,7 +477,7 @@ static int ieee80211_ioctl_siwap(struct net_device *dev,
 			return ret;
 		ieee80211_sta_req_auth(sdata, &sdata->u.sta);
 		return 0;
-	} else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) {
+	} else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
 		/*
 		 * If it is necessary to update the WDS peer address
 		 * while the interface is running, then we need to do
@@ -505,8 +505,8 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata;
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED ||
 		    sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) {
 			ap_addr->sa_family = ARPHRD_ETHER;
@@ -516,7 +516,7 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
 			memset(&ap_addr->sa_data, 0, ETH_ALEN);
 			return 0;
 		}
-	} else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) {
+	} else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
 		ap_addr->sa_family = ARPHRD_ETHER;
 		memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN);
 		return 0;
@@ -538,10 +538,10 @@ static int ieee80211_ioctl_siwscan(struct net_device *dev,
 	if (!netif_running(dev))
 		return -ENETDOWN;
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_MESH_POINT &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_AP)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+	    sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+	    sdata->vif.type != NL80211_IFTYPE_AP)
 		return -EOPNOTSUPP;
 
 	/* if SSID was specified explicitly then use that */
@@ -627,7 +627,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
@@ -858,8 +858,8 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev,
 	struct iw_mlme *mlme = (struct iw_mlme *) extra;
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (sdata->vif.type != IEEE80211_IF_TYPE_STA &&
-	    sdata->vif.type != IEEE80211_IF_TYPE_IBSS)
+	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC)
 		return -EINVAL;
 
 	switch (mlme->cmd) {
@@ -954,7 +954,7 @@ static int ieee80211_ioctl_giwencode(struct net_device *dev,
 	erq->length = sdata->keys[idx]->conf.keylen;
 	erq->flags |= IW_ENCODE_ENABLED;
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA) {
+	if (sdata->vif.type == NL80211_IFTYPE_STATION) {
 		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
 		switch (ifsta->auth_alg) {
 		case WLAN_AUTH_OPEN:
@@ -1028,7 +1028,7 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
 		sdata->drop_unencrypted = !!data->value;
 		break;
 	case IW_AUTH_PRIVACY_INVOKED:
-		if (sdata->vif.type != IEEE80211_IF_TYPE_STA)
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
 			ret = -EINVAL;
 		else {
 			sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
@@ -1043,8 +1043,8 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
 		}
 		break;
 	case IW_AUTH_80211_AUTH_ALG:
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-		    sdata->vif.type == IEEE80211_IF_TYPE_IBSS)
+		if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+		    sdata->vif.type == NL80211_IFTYPE_ADHOC)
 			sdata->u.sta.auth_algs = data->value;
 		else
 			ret = -EOPNOTSUPP;
@@ -1066,8 +1066,8 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
 
 	rcu_read_lock();
 
-	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS)
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC)
 		sta = sta_info_get(local, sdata->u.sta.bssid);
 	if (!sta) {
 		wstats->discard.fragment = 0;
@@ -1097,8 +1097,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev,
 
 	switch (data->flags & IW_AUTH_INDEX) {
 	case IW_AUTH_80211_AUTH_ALG:
-		if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
-		    sdata->vif.type == IEEE80211_IF_TYPE_IBSS)
+		if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+		    sdata->vif.type == NL80211_IFTYPE_ADHOC)
 			data->value = sdata->u.sta.auth_algs;
 		else
 			ret = -EOPNOTSUPP;
-- 
cgit v1.2.3


From 0d143fe1e2efc084fa730d2dfa22d0d1ca2ee5f1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:01:59 +0200
Subject: mac80211: move regular interface handling

Move the code to handle regular interfaces out of main.c and
into iface.c, keep only the master interface stuff in main.c.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |   2 +-
 net/mac80211/iface.c       | 534 ++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/main.c        | 565 ++-------------------------------------------
 3 files changed, 550 insertions(+), 551 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 80d88f5ff90..6bd6a6306da 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -891,6 +891,7 @@ u32 ieee80211_handle_ht(struct ieee80211_local *local, int enable_ht,
 			struct ieee80211_ht_bss_info *req_bss_cap);
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u32 changed);
+void ieee80211_configure_filter(struct ieee80211_local *local);
 
 /* wireless extensions */
 extern const struct iw_handler_def ieee80211_iw_handler_def;
@@ -952,7 +953,6 @@ void ieee80211_rx_bss_put(struct ieee80211_local *local,
 			  struct ieee80211_bss *bss);
 
 /* interface handling */
-void ieee80211_if_setup(struct net_device *dev);
 int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 		     struct net_device **new_dev, enum nl80211_iftype type,
 		     struct vif_params *params);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 004fb23241d..f528962b13e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1,4 +1,6 @@
 /*
+ * Interface handling (except master interface)
+ *
  * Copyright 2002-2005, Instant802 Networks, Inc.
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
@@ -17,7 +19,539 @@
 #include "sta_info.h"
 #include "debugfs_netdev.h"
 #include "mesh.h"
+#include "led.h"
+
+static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
+{
+	int meshhdrlen;
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	meshhdrlen = (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ? 5 : 0;
+
+	/* FIX: what would be proper limits for MTU?
+	 * This interface uses 802.3 frames. */
+	if (new_mtu < 256 ||
+	    new_mtu > IEEE80211_MAX_DATA_LEN - 24 - 6 - meshhdrlen) {
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	printk(KERN_DEBUG "%s: setting MTU %d\n", dev->name, new_mtu);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+static inline int identical_mac_addr_allowed(int type1, int type2)
+{
+	return type1 == NL80211_IFTYPE_MONITOR ||
+		type2 == NL80211_IFTYPE_MONITOR ||
+		(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) ||
+		(type1 == NL80211_IFTYPE_WDS &&
+			(type2 == NL80211_IFTYPE_WDS ||
+			 type2 == NL80211_IFTYPE_AP)) ||
+		(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) ||
+		(type1 == NL80211_IFTYPE_AP_VLAN &&
+			(type2 == NL80211_IFTYPE_AP ||
+			 type2 == NL80211_IFTYPE_AP_VLAN));
+}
+
+static int ieee80211_open(struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata, *nsdata;
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct sta_info *sta;
+	struct ieee80211_if_init_conf conf;
+	u32 changed = 0;
+	int res;
+	bool need_hw_reconfig = 0;
+	u8 null_addr[ETH_ALEN] = {0};
+
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	/* fail early if user set an invalid address */
+	if (compare_ether_addr(dev->dev_addr, null_addr) &&
+	    !is_valid_ether_addr(dev->dev_addr))
+		return -EADDRNOTAVAIL;
+
+	/* we hold the RTNL here so can safely walk the list */
+	list_for_each_entry(nsdata, &local->interfaces, list) {
+		struct net_device *ndev = nsdata->dev;
+
+		if (ndev != dev && netif_running(ndev)) {
+			/*
+			 * Allow only a single IBSS interface to be up at any
+			 * time. This is restricted because beacon distribution
+			 * cannot work properly if both are in the same IBSS.
+			 *
+			 * To remove this restriction we'd have to disallow them
+			 * from setting the same SSID on different IBSS interfaces
+			 * belonging to the same hardware. Then, however, we're
+			 * faced with having to adopt two different TSF timers...
+			 */
+			if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+			    nsdata->vif.type == NL80211_IFTYPE_ADHOC)
+				return -EBUSY;
+
+			/*
+			 * The remaining checks are only performed for interfaces
+			 * with the same MAC address.
+			 */
+			if (compare_ether_addr(dev->dev_addr, ndev->dev_addr))
+				continue;
+
+			/*
+			 * check whether it may have the same address
+			 */
+			if (!identical_mac_addr_allowed(sdata->vif.type,
+							nsdata->vif.type))
+				return -ENOTUNIQ;
+
+			/*
+			 * can only add VLANs to enabled APs
+			 */
+			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+			    nsdata->vif.type == NL80211_IFTYPE_AP)
+				sdata->bss = &nsdata->u.ap;
+		}
+	}
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_WDS:
+		if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
+			return -ENOLINK;
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+		if (!sdata->bss)
+			return -ENOLINK;
+		list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
+		break;
+	case NL80211_IFTYPE_AP:
+		sdata->bss = &sdata->u.ap;
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		if (!ieee80211_vif_is_mesh(&sdata->vif))
+			break;
+		/* mesh ifaces must set allmulti to forward mcast traffic */
+		atomic_inc(&local->iff_allmultis);
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_MONITOR:
+	case NL80211_IFTYPE_ADHOC:
+		/* no special treatment */
+		break;
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case __NL80211_IFTYPE_AFTER_LAST:
+		/* cannot happen */
+		WARN_ON(1);
+		break;
+	}
+
+	if (local->open_count == 0) {
+		res = 0;
+		if (local->ops->start)
+			res = local->ops->start(local_to_hw(local));
+		if (res)
+			goto err_del_bss;
+		need_hw_reconfig = 1;
+		ieee80211_led_radio(local, local->hw.conf.radio_enabled);
+	}
 
+	/*
+	 * Check all interfaces and copy the hopefully now-present
+	 * MAC address to those that have the special null one.
+	 */
+	list_for_each_entry(nsdata, &local->interfaces, list) {
+		struct net_device *ndev = nsdata->dev;
+
+		/*
+		 * No need to check netif_running since we do not allow
+		 * it to start up with this invalid address.
+		 */
+		if (compare_ether_addr(null_addr, ndev->dev_addr) == 0)
+			memcpy(ndev->dev_addr,
+			       local->hw.wiphy->perm_addr,
+			       ETH_ALEN);
+	}
+
+	if (compare_ether_addr(null_addr, local->mdev->dev_addr) == 0)
+		memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr,
+		       ETH_ALEN);
+
+	/*
+	 * Validate the MAC address for this device.
+	 */
+	if (!is_valid_ether_addr(dev->dev_addr)) {
+		if (!local->open_count && local->ops->stop)
+			local->ops->stop(local_to_hw(local));
+		return -EADDRNOTAVAIL;
+	}
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		/* no need to tell driver */
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+			local->cooked_mntrs++;
+			break;
+		}
+
+		/* must be before the call to ieee80211_configure_filter */
+		local->monitors++;
+		if (local->monitors == 1)
+			local->hw.conf.flags |= IEEE80211_CONF_RADIOTAP;
+
+		if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
+			local->fif_fcsfail++;
+		if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
+			local->fif_plcpfail++;
+		if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL)
+			local->fif_control++;
+		if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
+			local->fif_other_bss++;
+
+		netif_addr_lock_bh(local->mdev);
+		ieee80211_configure_filter(local);
+		netif_addr_unlock_bh(local->mdev);
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+		sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+		/* fall through */
+	default:
+		conf.vif = &sdata->vif;
+		conf.type = sdata->vif.type;
+		conf.mac_addr = dev->dev_addr;
+		res = local->ops->add_interface(local_to_hw(local), &conf);
+		if (res)
+			goto err_stop;
+
+		if (ieee80211_vif_is_mesh(&sdata->vif))
+			ieee80211_start_mesh(sdata);
+		changed |= ieee80211_reset_erp_info(sdata);
+		ieee80211_bss_info_change_notify(sdata, changed);
+		ieee80211_enable_keys(sdata);
+
+		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+		    !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME))
+			netif_carrier_off(dev);
+		else
+			netif_carrier_on(dev);
+	}
+
+	if (sdata->vif.type == NL80211_IFTYPE_WDS) {
+		/* Create STA entry for the WDS peer */
+		sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
+				     GFP_KERNEL);
+		if (!sta) {
+			res = -ENOMEM;
+			goto err_del_interface;
+		}
+
+		/* no locking required since STA is not live yet */
+		sta->flags |= WLAN_STA_AUTHORIZED;
+
+		res = sta_info_insert(sta);
+		if (res) {
+			/* STA has been freed */
+			goto err_del_interface;
+		}
+	}
+
+	if (local->open_count == 0) {
+		res = dev_open(local->mdev);
+		WARN_ON(res);
+		if (res)
+			goto err_del_interface;
+		tasklet_enable(&local->tx_pending_tasklet);
+		tasklet_enable(&local->tasklet);
+	}
+
+	/*
+	 * set_multicast_list will be invoked by the networking core
+	 * which will check whether any increments here were done in
+	 * error and sync them down to the hardware as filter flags.
+	 */
+	if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+		atomic_inc(&local->iff_allmultis);
+
+	if (sdata->flags & IEEE80211_SDATA_PROMISC)
+		atomic_inc(&local->iff_promiscs);
+
+	local->open_count++;
+	if (need_hw_reconfig) {
+		ieee80211_hw_config(local);
+		/*
+		 * set default queue parameters so drivers don't
+		 * need to initialise the hardware if the hardware
+		 * doesn't start up with sane defaults
+		 */
+		ieee80211_set_wmm_default(sdata);
+	}
+
+	/*
+	 * ieee80211_sta_work is disabled while network interface
+	 * is down. Therefore, some configuration changes may not
+	 * yet be effective. Trigger execution of ieee80211_sta_work
+	 * to fix this.
+	 */
+	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+		queue_work(local->hw.workqueue, &ifsta->work);
+	}
+
+	netif_tx_start_all_queues(dev);
+
+	return 0;
+ err_del_interface:
+	local->ops->remove_interface(local_to_hw(local), &conf);
+ err_stop:
+	if (!local->open_count && local->ops->stop)
+		local->ops->stop(local_to_hw(local));
+ err_del_bss:
+	sdata->bss = NULL;
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		list_del(&sdata->u.vlan.list);
+	return res;
+}
+
+static int ieee80211_stop(struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_init_conf conf;
+	struct sta_info *sta;
+
+	/*
+	 * Stop TX on this interface first.
+	 */
+	netif_tx_stop_all_queues(dev);
+
+	/*
+	 * Now delete all active aggregation sessions.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		if (sta->sdata == sdata)
+			ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
+	}
+
+	rcu_read_unlock();
+
+	/*
+	 * Remove all stations associated with this interface.
+	 *
+	 * This must be done before calling ops->remove_interface()
+	 * because otherwise we can later invoke ops->sta_notify()
+	 * whenever the STAs are removed, and that invalidates driver
+	 * assumptions about always getting a vif pointer that is valid
+	 * (because if we remove a STA after ops->remove_interface()
+	 * the driver will have removed the vif info already!)
+	 *
+	 * We could relax this and only unlink the stations from the
+	 * hash table and list but keep them on a per-sdata list that
+	 * will be inserted back again when the interface is brought
+	 * up again, but I don't currently see a use case for that,
+	 * except with WDS which gets a STA entry created when it is
+	 * brought up.
+	 */
+	sta_info_flush(local, sdata);
+
+	/*
+	 * Don't count this interface for promisc/allmulti while it
+	 * is down. dev_mc_unsync() will invoke set_multicast_list
+	 * on the master interface which will sync these down to the
+	 * hardware as filter flags.
+	 */
+	if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+		atomic_dec(&local->iff_allmultis);
+
+	if (sdata->flags & IEEE80211_SDATA_PROMISC)
+		atomic_dec(&local->iff_promiscs);
+
+	dev_mc_unsync(local->mdev, dev);
+
+	/* APs need special treatment */
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
+		struct ieee80211_sub_if_data *vlan, *tmp;
+		struct beacon_data *old_beacon = sdata->u.ap.beacon;
+
+		/* remove beacon */
+		rcu_assign_pointer(sdata->u.ap.beacon, NULL);
+		synchronize_rcu();
+		kfree(old_beacon);
+
+		/* down all dependent devices, that is VLANs */
+		list_for_each_entry_safe(vlan, tmp, &sdata->u.ap.vlans,
+					 u.vlan.list)
+			dev_close(vlan->dev);
+		WARN_ON(!list_empty(&sdata->u.ap.vlans));
+	}
+
+	local->open_count--;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		list_del(&sdata->u.vlan.list);
+		/* no need to tell driver */
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+			local->cooked_mntrs--;
+			break;
+		}
+
+		local->monitors--;
+		if (local->monitors == 0)
+			local->hw.conf.flags &= ~IEEE80211_CONF_RADIOTAP;
+
+		if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
+			local->fif_fcsfail--;
+		if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
+			local->fif_plcpfail--;
+		if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL)
+			local->fif_control--;
+		if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
+			local->fif_other_bss--;
+
+		netif_addr_lock_bh(local->mdev);
+		ieee80211_configure_filter(local);
+		netif_addr_unlock_bh(local->mdev);
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+		sdata->u.sta.state = IEEE80211_STA_MLME_DISABLED;
+		memset(sdata->u.sta.bssid, 0, ETH_ALEN);
+		del_timer_sync(&sdata->u.sta.timer);
+		/*
+		 * If the timer fired while we waited for it, it will have
+		 * requeued the work. Now the work will be running again
+		 * but will not rearm the timer again because it checks
+		 * whether the interface is running, which, at this point,
+		 * it no longer is.
+		 */
+		cancel_work_sync(&sdata->u.sta.work);
+		/*
+		 * When we get here, the interface is marked down.
+		 * Call synchronize_rcu() to wait for the RX path
+		 * should it be using the interface and enqueuing
+		 * frames at this very time on another CPU.
+		 */
+		synchronize_rcu();
+		skb_queue_purge(&sdata->u.sta.skb_queue);
+
+		sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
+		kfree(sdata->u.sta.extra_ie);
+		sdata->u.sta.extra_ie = NULL;
+		sdata->u.sta.extra_ie_len = 0;
+		/* fall through */
+	case NL80211_IFTYPE_MESH_POINT:
+		if (ieee80211_vif_is_mesh(&sdata->vif)) {
+			/* allmulti is always set on mesh ifaces */
+			atomic_dec(&local->iff_allmultis);
+			ieee80211_stop_mesh(sdata);
+		}
+		/* fall through */
+	default:
+		if (local->scan_sdata == sdata) {
+			if (!local->ops->hw_scan)
+				cancel_delayed_work_sync(&local->scan_work);
+			/*
+			 * The software scan can no longer run now, so we can
+			 * clear out the scan_sdata reference. However, the
+			 * hardware scan may still be running. The complete
+			 * function must be prepared to handle a NULL value.
+			 */
+			local->scan_sdata = NULL;
+			/*
+			 * The memory barrier guarantees that another CPU
+			 * that is hardware-scanning will now see the fact
+			 * that this interface is gone.
+			 */
+			smp_mb();
+			/*
+			 * If software scanning, complete the scan but since
+			 * the scan_sdata is NULL already don't send out a
+			 * scan event to userspace -- the scan is incomplete.
+			 */
+			if (local->sw_scanning)
+				ieee80211_scan_completed(&local->hw);
+		}
+
+		conf.vif = &sdata->vif;
+		conf.type = sdata->vif.type;
+		conf.mac_addr = dev->dev_addr;
+		/* disable all keys for as long as this netdev is down */
+		ieee80211_disable_keys(sdata);
+		local->ops->remove_interface(local_to_hw(local), &conf);
+	}
+
+	sdata->bss = NULL;
+
+	if (local->open_count == 0) {
+		if (netif_running(local->mdev))
+			dev_close(local->mdev);
+
+		if (local->ops->stop)
+			local->ops->stop(local_to_hw(local));
+
+		ieee80211_led_radio(local, 0);
+
+		flush_workqueue(local->hw.workqueue);
+
+		tasklet_disable(&local->tx_pending_tasklet);
+		tasklet_disable(&local->tasklet);
+	}
+
+	return 0;
+}
+
+static void ieee80211_set_multicast_list(struct net_device *dev)
+{
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	int allmulti, promisc, sdata_allmulti, sdata_promisc;
+
+	allmulti = !!(dev->flags & IFF_ALLMULTI);
+	promisc = !!(dev->flags & IFF_PROMISC);
+	sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI);
+	sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC);
+
+	if (allmulti != sdata_allmulti) {
+		if (dev->flags & IFF_ALLMULTI)
+			atomic_inc(&local->iff_allmultis);
+		else
+			atomic_dec(&local->iff_allmultis);
+		sdata->flags ^= IEEE80211_SDATA_ALLMULTI;
+	}
+
+	if (promisc != sdata_promisc) {
+		if (dev->flags & IFF_PROMISC)
+			atomic_inc(&local->iff_promiscs);
+		else
+			atomic_dec(&local->iff_promiscs);
+		sdata->flags ^= IEEE80211_SDATA_PROMISC;
+	}
+
+	dev_mc_sync(local->mdev, dev);
+}
+
+static void ieee80211_if_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+	dev->hard_start_xmit = ieee80211_subif_start_xmit;
+	dev->wireless_handlers = &ieee80211_iw_handler_def;
+	dev->set_multicast_list = ieee80211_set_multicast_list;
+	dev->change_mtu = ieee80211_change_mtu;
+	dev->open = ieee80211_open;
+	dev->stop = ieee80211_stop;
+	dev->destructor = free_netdev;
+	/* we will validate the address ourselves in ->open */
+	dev->validate_addr = NULL;
+}
 /*
  * Called when the netdev is removed or, by the code below, before
  * the interface type changes.
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 584a75bd6cf..c532043c1a1 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -45,16 +45,9 @@ struct ieee80211_tx_status_rtap_hdr {
 	u8 data_retries;
 } __attribute__ ((packed));
 
-/* common interface routines */
-
-static int header_parse_80211(const struct sk_buff *skb, unsigned char *haddr)
-{
-	memcpy(haddr, skb_mac_header(skb) + 10, ETH_ALEN); /* addr2 */
-	return ETH_ALEN;
-}
 
 /* must be called under mdev tx lock */
-static void ieee80211_configure_filter(struct ieee80211_local *local)
+void ieee80211_configure_filter(struct ieee80211_local *local)
 {
 	unsigned int changed_flags;
 	unsigned int new_flags = 0;
@@ -97,6 +90,20 @@ static void ieee80211_configure_filter(struct ieee80211_local *local)
 
 /* master interface */
 
+static int header_parse_80211(const struct sk_buff *skb, unsigned char *haddr)
+{
+	memcpy(haddr, skb_mac_header(skb) + 10, ETH_ALEN); /* addr2 */
+	return ETH_ALEN;
+}
+
+static const struct header_ops ieee80211_header_ops = {
+	.create		= eth_header,
+	.parse		= header_parse_80211,
+	.rebuild	= eth_rebuild_header,
+	.cache		= eth_header_cache,
+	.cache_update	= eth_header_cache_update,
+};
+
 static int ieee80211_master_open(struct net_device *dev)
 {
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
@@ -139,548 +146,6 @@ static void ieee80211_master_set_multicast_list(struct net_device *dev)
 	ieee80211_configure_filter(local);
 }
 
-/* regular interfaces */
-
-static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
-{
-	int meshhdrlen;
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
-	meshhdrlen = (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ? 5 : 0;
-
-	/* FIX: what would be proper limits for MTU?
-	 * This interface uses 802.3 frames. */
-	if (new_mtu < 256 ||
-	    new_mtu > IEEE80211_MAX_DATA_LEN - 24 - 6 - meshhdrlen) {
-		return -EINVAL;
-	}
-
-#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: setting MTU %d\n", dev->name, new_mtu);
-#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
-	dev->mtu = new_mtu;
-	return 0;
-}
-
-static inline int identical_mac_addr_allowed(int type1, int type2)
-{
-	return type1 == NL80211_IFTYPE_MONITOR ||
-		type2 == NL80211_IFTYPE_MONITOR ||
-		(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) ||
-		(type1 == NL80211_IFTYPE_WDS &&
-			(type2 == NL80211_IFTYPE_WDS ||
-			 type2 == NL80211_IFTYPE_AP)) ||
-		(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) ||
-		(type1 == NL80211_IFTYPE_AP_VLAN &&
-			(type2 == NL80211_IFTYPE_AP ||
-			 type2 == NL80211_IFTYPE_AP_VLAN));
-}
-
-static int ieee80211_open(struct net_device *dev)
-{
-	struct ieee80211_sub_if_data *sdata, *nsdata;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct sta_info *sta;
-	struct ieee80211_if_init_conf conf;
-	u32 changed = 0;
-	int res;
-	bool need_hw_reconfig = 0;
-	u8 null_addr[ETH_ALEN] = {0};
-
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
-	/* fail early if user set an invalid address */
-	if (compare_ether_addr(dev->dev_addr, null_addr) &&
-	    !is_valid_ether_addr(dev->dev_addr))
-		return -EADDRNOTAVAIL;
-
-	/* we hold the RTNL here so can safely walk the list */
-	list_for_each_entry(nsdata, &local->interfaces, list) {
-		struct net_device *ndev = nsdata->dev;
-
-		if (ndev != dev && netif_running(ndev)) {
-			/*
-			 * Allow only a single IBSS interface to be up at any
-			 * time. This is restricted because beacon distribution
-			 * cannot work properly if both are in the same IBSS.
-			 *
-			 * To remove this restriction we'd have to disallow them
-			 * from setting the same SSID on different IBSS interfaces
-			 * belonging to the same hardware. Then, however, we're
-			 * faced with having to adopt two different TSF timers...
-			 */
-			if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
-			    nsdata->vif.type == NL80211_IFTYPE_ADHOC)
-				return -EBUSY;
-
-			/*
-			 * The remaining checks are only performed for interfaces
-			 * with the same MAC address.
-			 */
-			if (compare_ether_addr(dev->dev_addr, ndev->dev_addr))
-				continue;
-
-			/*
-			 * check whether it may have the same address
-			 */
-			if (!identical_mac_addr_allowed(sdata->vif.type,
-							nsdata->vif.type))
-				return -ENOTUNIQ;
-
-			/*
-			 * can only add VLANs to enabled APs
-			 */
-			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
-			    nsdata->vif.type == NL80211_IFTYPE_AP)
-				sdata->bss = &nsdata->u.ap;
-		}
-	}
-
-	switch (sdata->vif.type) {
-	case NL80211_IFTYPE_WDS:
-		if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
-			return -ENOLINK;
-		break;
-	case NL80211_IFTYPE_AP_VLAN:
-		if (!sdata->bss)
-			return -ENOLINK;
-		list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
-		break;
-	case NL80211_IFTYPE_AP:
-		sdata->bss = &sdata->u.ap;
-		break;
-	case NL80211_IFTYPE_MESH_POINT:
-		if (!ieee80211_vif_is_mesh(&sdata->vif))
-			break;
-		/* mesh ifaces must set allmulti to forward mcast traffic */
-		atomic_inc(&local->iff_allmultis);
-		break;
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_MONITOR:
-	case NL80211_IFTYPE_ADHOC:
-		/* no special treatment */
-		break;
-	case NL80211_IFTYPE_UNSPECIFIED:
-	case __NL80211_IFTYPE_AFTER_LAST:
-		/* cannot happen */
-		WARN_ON(1);
-		break;
-	}
-
-	if (local->open_count == 0) {
-		res = 0;
-		if (local->ops->start)
-			res = local->ops->start(local_to_hw(local));
-		if (res)
-			goto err_del_bss;
-		need_hw_reconfig = 1;
-		ieee80211_led_radio(local, local->hw.conf.radio_enabled);
-	}
-
-	/*
-	 * Check all interfaces and copy the hopefully now-present
-	 * MAC address to those that have the special null one.
-	 */
-	list_for_each_entry(nsdata, &local->interfaces, list) {
-		struct net_device *ndev = nsdata->dev;
-
-		/*
-		 * No need to check netif_running since we do not allow
-		 * it to start up with this invalid address.
-		 */
-		if (compare_ether_addr(null_addr, ndev->dev_addr) == 0)
-			memcpy(ndev->dev_addr,
-			       local->hw.wiphy->perm_addr,
-			       ETH_ALEN);
-	}
-
-	if (compare_ether_addr(null_addr, local->mdev->dev_addr) == 0)
-		memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr,
-		       ETH_ALEN);
-
-	/*
-	 * Validate the MAC address for this device.
-	 */
-	if (!is_valid_ether_addr(dev->dev_addr)) {
-		if (!local->open_count && local->ops->stop)
-			local->ops->stop(local_to_hw(local));
-		return -EADDRNOTAVAIL;
-	}
-
-	switch (sdata->vif.type) {
-	case NL80211_IFTYPE_AP_VLAN:
-		/* no need to tell driver */
-		break;
-	case NL80211_IFTYPE_MONITOR:
-		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
-			local->cooked_mntrs++;
-			break;
-		}
-
-		/* must be before the call to ieee80211_configure_filter */
-		local->monitors++;
-		if (local->monitors == 1)
-			local->hw.conf.flags |= IEEE80211_CONF_RADIOTAP;
-
-		if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
-			local->fif_fcsfail++;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
-			local->fif_plcpfail++;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL)
-			local->fif_control++;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
-			local->fif_other_bss++;
-
-		netif_addr_lock_bh(local->mdev);
-		ieee80211_configure_filter(local);
-		netif_addr_unlock_bh(local->mdev);
-		break;
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_ADHOC:
-		sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
-		/* fall through */
-	default:
-		conf.vif = &sdata->vif;
-		conf.type = sdata->vif.type;
-		conf.mac_addr = dev->dev_addr;
-		res = local->ops->add_interface(local_to_hw(local), &conf);
-		if (res)
-			goto err_stop;
-
-		if (ieee80211_vif_is_mesh(&sdata->vif))
-			ieee80211_start_mesh(sdata);
-		changed |= ieee80211_reset_erp_info(sdata);
-		ieee80211_bss_info_change_notify(sdata, changed);
-		ieee80211_enable_keys(sdata);
-
-		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
-		    !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME))
-			netif_carrier_off(dev);
-		else
-			netif_carrier_on(dev);
-	}
-
-	if (sdata->vif.type == NL80211_IFTYPE_WDS) {
-		/* Create STA entry for the WDS peer */
-		sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
-				     GFP_KERNEL);
-		if (!sta) {
-			res = -ENOMEM;
-			goto err_del_interface;
-		}
-
-		/* no locking required since STA is not live yet */
-		sta->flags |= WLAN_STA_AUTHORIZED;
-
-		res = sta_info_insert(sta);
-		if (res) {
-			/* STA has been freed */
-			goto err_del_interface;
-		}
-	}
-
-	if (local->open_count == 0) {
-		res = dev_open(local->mdev);
-		WARN_ON(res);
-		if (res)
-			goto err_del_interface;
-		tasklet_enable(&local->tx_pending_tasklet);
-		tasklet_enable(&local->tasklet);
-	}
-
-	/*
-	 * set_multicast_list will be invoked by the networking core
-	 * which will check whether any increments here were done in
-	 * error and sync them down to the hardware as filter flags.
-	 */
-	if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
-		atomic_inc(&local->iff_allmultis);
-
-	if (sdata->flags & IEEE80211_SDATA_PROMISC)
-		atomic_inc(&local->iff_promiscs);
-
-	local->open_count++;
-	if (need_hw_reconfig) {
-		ieee80211_hw_config(local);
-		/*
-		 * set default queue parameters so drivers don't
-		 * need to initialise the hardware if the hardware
-		 * doesn't start up with sane defaults
-		 */
-		ieee80211_set_wmm_default(sdata);
-	}
-
-	/*
-	 * ieee80211_sta_work is disabled while network interface
-	 * is down. Therefore, some configuration changes may not
-	 * yet be effective. Trigger execution of ieee80211_sta_work
-	 * to fix this.
-	 */
-	if (sdata->vif.type == NL80211_IFTYPE_STATION ||
-	    sdata->vif.type == NL80211_IFTYPE_ADHOC) {
-		struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-		queue_work(local->hw.workqueue, &ifsta->work);
-	}
-
-	netif_tx_start_all_queues(dev);
-
-	return 0;
- err_del_interface:
-	local->ops->remove_interface(local_to_hw(local), &conf);
- err_stop:
-	if (!local->open_count && local->ops->stop)
-		local->ops->stop(local_to_hw(local));
- err_del_bss:
-	sdata->bss = NULL;
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		list_del(&sdata->u.vlan.list);
-	return res;
-}
-
-static int ieee80211_stop(struct net_device *dev)
-{
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_init_conf conf;
-	struct sta_info *sta;
-
-	/*
-	 * Stop TX on this interface first.
-	 */
-	netif_tx_stop_all_queues(dev);
-
-	/*
-	 * Now delete all active aggregation sessions.
-	 */
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(sta, &local->sta_list, list) {
-		if (sta->sdata == sdata)
-			ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
-	}
-
-	rcu_read_unlock();
-
-	/*
-	 * Remove all stations associated with this interface.
-	 *
-	 * This must be done before calling ops->remove_interface()
-	 * because otherwise we can later invoke ops->sta_notify()
-	 * whenever the STAs are removed, and that invalidates driver
-	 * assumptions about always getting a vif pointer that is valid
-	 * (because if we remove a STA after ops->remove_interface()
-	 * the driver will have removed the vif info already!)
-	 *
-	 * We could relax this and only unlink the stations from the
-	 * hash table and list but keep them on a per-sdata list that
-	 * will be inserted back again when the interface is brought
-	 * up again, but I don't currently see a use case for that,
-	 * except with WDS which gets a STA entry created when it is
-	 * brought up.
-	 */
-	sta_info_flush(local, sdata);
-
-	/*
-	 * Don't count this interface for promisc/allmulti while it
-	 * is down. dev_mc_unsync() will invoke set_multicast_list
-	 * on the master interface which will sync these down to the
-	 * hardware as filter flags.
-	 */
-	if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
-		atomic_dec(&local->iff_allmultis);
-
-	if (sdata->flags & IEEE80211_SDATA_PROMISC)
-		atomic_dec(&local->iff_promiscs);
-
-	dev_mc_unsync(local->mdev, dev);
-
-	/* APs need special treatment */
-	if (sdata->vif.type == NL80211_IFTYPE_AP) {
-		struct ieee80211_sub_if_data *vlan, *tmp;
-		struct beacon_data *old_beacon = sdata->u.ap.beacon;
-
-		/* remove beacon */
-		rcu_assign_pointer(sdata->u.ap.beacon, NULL);
-		synchronize_rcu();
-		kfree(old_beacon);
-
-		/* down all dependent devices, that is VLANs */
-		list_for_each_entry_safe(vlan, tmp, &sdata->u.ap.vlans,
-					 u.vlan.list)
-			dev_close(vlan->dev);
-		WARN_ON(!list_empty(&sdata->u.ap.vlans));
-	}
-
-	local->open_count--;
-
-	switch (sdata->vif.type) {
-	case NL80211_IFTYPE_AP_VLAN:
-		list_del(&sdata->u.vlan.list);
-		/* no need to tell driver */
-		break;
-	case NL80211_IFTYPE_MONITOR:
-		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
-			local->cooked_mntrs--;
-			break;
-		}
-
-		local->monitors--;
-		if (local->monitors == 0)
-			local->hw.conf.flags &= ~IEEE80211_CONF_RADIOTAP;
-
-		if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
-			local->fif_fcsfail--;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
-			local->fif_plcpfail--;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL)
-			local->fif_control--;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
-			local->fif_other_bss--;
-
-		netif_addr_lock_bh(local->mdev);
-		ieee80211_configure_filter(local);
-		netif_addr_unlock_bh(local->mdev);
-		break;
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_ADHOC:
-		sdata->u.sta.state = IEEE80211_STA_MLME_DISABLED;
-		memset(sdata->u.sta.bssid, 0, ETH_ALEN);
-		del_timer_sync(&sdata->u.sta.timer);
-		/*
-		 * If the timer fired while we waited for it, it will have
-		 * requeued the work. Now the work will be running again
-		 * but will not rearm the timer again because it checks
-		 * whether the interface is running, which, at this point,
-		 * it no longer is.
-		 */
-		cancel_work_sync(&sdata->u.sta.work);
-		/*
-		 * When we get here, the interface is marked down.
-		 * Call synchronize_rcu() to wait for the RX path
-		 * should it be using the interface and enqueuing
-		 * frames at this very time on another CPU.
-		 */
-		synchronize_rcu();
-		skb_queue_purge(&sdata->u.sta.skb_queue);
-
-		sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
-		kfree(sdata->u.sta.extra_ie);
-		sdata->u.sta.extra_ie = NULL;
-		sdata->u.sta.extra_ie_len = 0;
-		/* fall through */
-	case NL80211_IFTYPE_MESH_POINT:
-		if (ieee80211_vif_is_mesh(&sdata->vif)) {
-			/* allmulti is always set on mesh ifaces */
-			atomic_dec(&local->iff_allmultis);
-			ieee80211_stop_mesh(sdata);
-		}
-		/* fall through */
-	default:
-		if (local->scan_sdata == sdata) {
-			if (!local->ops->hw_scan)
-				cancel_delayed_work_sync(&local->scan_work);
-			/*
-			 * The software scan can no longer run now, so we can
-			 * clear out the scan_sdata reference. However, the
-			 * hardware scan may still be running. The complete
-			 * function must be prepared to handle a NULL value.
-			 */
-			local->scan_sdata = NULL;
-			/*
-			 * The memory barrier guarantees that another CPU
-			 * that is hardware-scanning will now see the fact
-			 * that this interface is gone.
-			 */
-			smp_mb();
-			/*
-			 * If software scanning, complete the scan but since
-			 * the scan_sdata is NULL already don't send out a
-			 * scan event to userspace -- the scan is incomplete.
-			 */
-			if (local->sw_scanning)
-				ieee80211_scan_completed(&local->hw);
-		}
-
-		conf.vif = &sdata->vif;
-		conf.type = sdata->vif.type;
-		conf.mac_addr = dev->dev_addr;
-		/* disable all keys for as long as this netdev is down */
-		ieee80211_disable_keys(sdata);
-		local->ops->remove_interface(local_to_hw(local), &conf);
-	}
-
-	sdata->bss = NULL;
-
-	if (local->open_count == 0) {
-		if (netif_running(local->mdev))
-			dev_close(local->mdev);
-
-		if (local->ops->stop)
-			local->ops->stop(local_to_hw(local));
-
-		ieee80211_led_radio(local, 0);
-
-		flush_workqueue(local->hw.workqueue);
-
-		tasklet_disable(&local->tx_pending_tasklet);
-		tasklet_disable(&local->tasklet);
-	}
-
-	return 0;
-}
-
-static void ieee80211_set_multicast_list(struct net_device *dev)
-{
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	int allmulti, promisc, sdata_allmulti, sdata_promisc;
-
-	allmulti = !!(dev->flags & IFF_ALLMULTI);
-	promisc = !!(dev->flags & IFF_PROMISC);
-	sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI);
-	sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC);
-
-	if (allmulti != sdata_allmulti) {
-		if (dev->flags & IFF_ALLMULTI)
-			atomic_inc(&local->iff_allmultis);
-		else
-			atomic_dec(&local->iff_allmultis);
-		sdata->flags ^= IEEE80211_SDATA_ALLMULTI;
-	}
-
-	if (promisc != sdata_promisc) {
-		if (dev->flags & IFF_PROMISC)
-			atomic_inc(&local->iff_promiscs);
-		else
-			atomic_dec(&local->iff_promiscs);
-		sdata->flags ^= IEEE80211_SDATA_PROMISC;
-	}
-
-	dev_mc_sync(local->mdev, dev);
-}
-
-static const struct header_ops ieee80211_header_ops = {
-	.create		= eth_header,
-	.parse		= header_parse_80211,
-	.rebuild	= eth_rebuild_header,
-	.cache		= eth_header_cache,
-	.cache_update	= eth_header_cache_update,
-};
-
-void ieee80211_if_setup(struct net_device *dev)
-{
-	ether_setup(dev);
-	dev->hard_start_xmit = ieee80211_subif_start_xmit;
-	dev->wireless_handlers = &ieee80211_iw_handler_def;
-	dev->set_multicast_list = ieee80211_set_multicast_list;
-	dev->change_mtu = ieee80211_change_mtu;
-	dev->open = ieee80211_open;
-	dev->stop = ieee80211_stop;
-	dev->destructor = free_netdev;
-	/* we will validate the address ourselves in ->open */
-	dev->validate_addr = NULL;
-}
-
 /* everything else */
 
 int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
-- 
cgit v1.2.3


From 7a725f73403e874ec52c58741e9b98cd604dbd03 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:02:00 +0200
Subject: mac80211: warn on some invalid vlan operations

These should never happen, but better warn about them than
crashing a driver, the fact that they never happen is rather
subtle throughout mac80211.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index c532043c1a1..dd838b725af 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -156,6 +156,9 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
 	if (WARN_ON(!netif_running(sdata->dev)))
 		return 0;
 
+	if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
+		return -EINVAL;
+
 	if (!local->ops->config_interface)
 		return 0;
 
@@ -321,6 +324,9 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 
+	if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
+		return;
+
 	if (!changed)
 		return;
 
-- 
cgit v1.2.3


From 17741cdc264e4d768167766a252210e201c1519a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 00:02:02 +0200
Subject: mac80211: share STA information with driver

This patch changes mac80211 to share some more data about
stations with drivers. Should help iwlwifi and ath9k when
 they get around to updating, and might also help with
implementing rate control algorithms without internals.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Sujith Manoharan <Sujith.Manoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c         | 12 +++++-----
 net/mac80211/debugfs_key.c |  3 ++-
 net/mac80211/debugfs_sta.c |  6 ++---
 net/mac80211/ht.c          | 22 ++++++++---------
 net/mac80211/iface.c       |  3 ++-
 net/mac80211/key.c         |  2 +-
 net/mac80211/mesh_hwmp.c   |  8 +++----
 net/mac80211/mesh_plink.c  | 44 +++++++++++++++++-----------------
 net/mac80211/mlme.c        |  5 ++--
 net/mac80211/rx.c          | 12 +++++-----
 net/mac80211/sta_info.c    | 59 ++++++++++++++++++++++++++++------------------
 net/mac80211/sta_info.h    |  7 ++++--
 net/mac80211/tkip.c        |  2 +-
 net/mac80211/tx.c          | 10 ++++----
 net/mac80211/wme.c         |  2 +-
 net/mac80211/wpa.c         |  2 +-
 16 files changed, 109 insertions(+), 90 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index d004351050c..ed5e77ce627 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -364,7 +364,7 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
 	sta = sta_info_get_by_idx(local, idx, dev);
 	if (sta) {
 		ret = 0;
-		memcpy(mac, sta->addr, ETH_ALEN);
+		memcpy(mac, sta->sta.addr, ETH_ALEN);
 		sta_set_sinfo(sta, sinfo);
 	}
 
@@ -593,7 +593,7 @@ static void ieee80211_send_layer2_update(struct sta_info *sta)
 	 * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
 
 	memset(msg->da, 0xff, ETH_ALEN);
-	memcpy(msg->sa, sta->addr, ETH_ALEN);
+	memcpy(msg->sa, sta->sta.addr, ETH_ALEN);
 	msg->len = htons(6);
 	msg->dsap = 0;
 	msg->ssap = 0x01;	/* NULL LSAP, CR Bit: Response */
@@ -648,9 +648,9 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 	 */
 
 	if (params->aid) {
-		sta->aid = params->aid;
-		if (sta->aid > IEEE80211_MAX_AID)
-			sta->aid = 0; /* XXX: should this be an error? */
+		sta->sta.aid = params->aid;
+		if (sta->sta.aid > IEEE80211_MAX_AID)
+			sta->sta.aid = 0; /* XXX: should this be an error? */
 	}
 
 	if (params->listen_interval >= 0)
@@ -919,7 +919,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 			    struct mpath_info *pinfo)
 {
 	if (mpath->next_hop)
-		memcpy(next_hop, mpath->next_hop->addr, ETH_ALEN);
+		memcpy(next_hop, mpath->next_hop->sta.addr, ETH_ALEN);
 	else
 		memset(next_hop, 0, ETH_ALEN);
 
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index cf82acec913..a3294d10932 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -206,7 +206,8 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key)
 	rcu_read_lock();
 	sta = rcu_dereference(key->sta);
 	if (sta)
-		sprintf(buf, "../../stations/%s", print_mac(mac, sta->addr));
+		sprintf(buf, "../../stations/%s",
+			print_mac(mac, sta->sta.addr));
 	rcu_read_unlock();
 
 	/* using sta as a boolean is fine outside RCU lock */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 6abe5427752..81f350eaf8a 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -50,7 +50,7 @@ static const struct file_operations sta_ ##name## _ops = {		\
 		STA_READ_##format(name, field)				\
 		STA_OPS(name)
 
-STA_FILE(aid, aid, D);
+STA_FILE(aid, sta.aid, D);
 STA_FILE(dev, sdata->dev->name, S);
 STA_FILE(rx_packets, rx_packets, LU);
 STA_FILE(tx_packets, tx_packets, LU);
@@ -176,7 +176,7 @@ static ssize_t sta_agg_status_write(struct file *file,
 	struct net_device *dev = sta->sdata->dev;
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_hw *hw = &local->hw;
-	u8 *da = sta->addr;
+	u8 *da = sta->sta.addr;
 	static int tid_static_tx[16] = {0, 0, 0, 0, 0, 0, 0, 0,
 					0, 0, 0, 0, 0, 0, 0, 0};
 	static int tid_static_rx[16] = {1, 1, 1, 1, 1, 1, 1, 1,
@@ -253,7 +253,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 	if (!stations_dir)
 		return;
 
-	mac = print_mac(mbuf, sta->addr);
+	mac = print_mac(mbuf, sta->sta.addr);
 
 	sta->debugfs.dir = debugfs_create_dir(mac, stations_dir);
 	if (!sta->debugfs.dir)
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index bc3c71ad7ae..dc7d9a3d70d 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -274,7 +274,7 @@ void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *r
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
 	ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_STOP,
-					ra, tid, NULL);
+				       &sta->sta, tid, NULL);
 	if (ret)
 		printk(KERN_DEBUG "HW problem - can not stop rx "
 				"aggregation for tid %d\n", tid);
@@ -328,7 +328,7 @@ static void sta_addba_resp_timer_expired(unsigned long data)
 
 	rcu_read_lock();
 
-	sta = sta_info_get(local, temp_sta->addr);
+	sta = sta_info_get(local, temp_sta->sta.addr);
 	if (!sta) {
 		rcu_read_unlock();
 		return;
@@ -354,7 +354,7 @@ static void sta_addba_resp_timer_expired(unsigned long data)
 	/* go through the state check in stop_BA_session */
 	*state = HT_AGG_STATE_OPERATIONAL;
 	spin_unlock_bh(&sta->lock);
-	ieee80211_stop_tx_ba_session(hw, temp_sta->addr, tid,
+	ieee80211_stop_tx_ba_session(hw, temp_sta->sta.addr, tid,
 				     WLAN_BACK_INITIATOR);
 
 timer_expired_exit:
@@ -465,7 +465,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 
 	if (local->ops->ampdu_action)
 		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
-						ra, tid, &start_seq_num);
+					       &sta->sta, tid, &start_seq_num);
 
 	if (ret) {
 		/* No need to requeue the packets in the agg queue, since we
@@ -557,7 +557,7 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw,
 
 	if (local->ops->ampdu_action)
 		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_STOP,
-						ra, tid, NULL);
+					       &sta->sta, tid, NULL);
 
 	/* case HW denied going back to legacy */
 	if (ret) {
@@ -767,7 +767,7 @@ static void sta_rx_agg_session_timer_expired(unsigned long data)
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
 #endif
-	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
+	ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->sta.addr,
 					 (u16)*ptid, WLAN_BACK_TIMER,
 					 WLAN_REASON_QSTA_TIMEOUT);
 }
@@ -874,7 +874,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 
 	if (local->ops->ampdu_action)
 		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START,
-					       sta->addr, tid, &start_seq_num);
+					       &sta->sta, tid, &start_seq_num);
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret);
 #endif /* CONFIG_MAC80211_HT_DEBUG */
@@ -899,7 +899,7 @@ end:
 	spin_unlock_bh(&sta->lock);
 
 end_no_lock:
-	ieee80211_send_addba_resp(sta->sdata, sta->addr, tid,
+	ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
 				  dialog_token, status, 1, buf_size, timeout);
 }
 
@@ -952,7 +952,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 		/* this will allow the state check in stop_BA_session */
 		*state = HT_AGG_STATE_OPERATIONAL;
 		spin_unlock_bh(&sta->lock);
-		ieee80211_stop_tx_ba_session(hw, sta->addr, tid,
+		ieee80211_stop_tx_ba_session(hw, sta->sta.addr, tid,
 					     WLAN_BACK_INITIATOR);
 	}
 }
@@ -979,14 +979,14 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
 	if (initiator == WLAN_BACK_INITIATOR)
-		ieee80211_sta_stop_rx_ba_session(sdata, sta->addr, tid,
+		ieee80211_sta_stop_rx_ba_session(sdata, sta->sta.addr, tid,
 						 WLAN_BACK_INITIATOR, 0);
 	else { /* WLAN_BACK_RECIPIENT */
 		spin_lock_bh(&sta->lock);
 		sta->ampdu_mlme.tid_state_tx[tid] =
 				HT_AGG_STATE_OPERATIONAL;
 		spin_unlock_bh(&sta->lock);
-		ieee80211_stop_tx_ba_session(&local->hw, sta->addr, tid,
+		ieee80211_stop_tx_ba_session(&local->hw, sta->sta.addr, tid,
 					     WLAN_BACK_RECIPIENT);
 	}
 }
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index f528962b13e..a7ef0289fbd 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -336,7 +336,8 @@ static int ieee80211_stop(struct net_device *dev)
 
 	list_for_each_entry_rcu(sta, &local->sta_list, list) {
 		if (sta->sdata == sdata)
-			ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
+			ieee80211_sta_tear_down_BA_sessions(sdata,
+							    sta->sta.addr);
 	}
 
 	rcu_read_unlock();
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index d5b95748db2..57afcd38cd9 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -123,7 +123,7 @@ static const u8 *get_mac_for_key(struct ieee80211_key *key)
 		addr = zero_addr;
 
 	if (key->sta)
-		addr = key->sta->addr;
+		addr = key->sta->sta.addr;
 
 	return addr;
 }
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 1fad792ad25..15a5c99270a 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -517,7 +517,7 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 		spin_unlock_bh(&mpath->state_lock);
 		goto fail;
 	}
-	memcpy(next_hop, mpath->next_hop->addr, ETH_ALEN);
+	memcpy(next_hop, mpath->next_hop->sta.addr, ETH_ALEN);
 	spin_unlock_bh(&mpath->state_lock);
 	--ttl;
 	flags = PREP_IE_FLAGS(prep_elem);
@@ -529,7 +529,7 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 
 	mesh_path_sel_frame_tx(MPATH_PREP, flags, orig_addr,
 		cpu_to_le32(orig_dsn), 0, dst_addr,
-		cpu_to_le32(dst_dsn), mpath->next_hop->addr, hopcount, ttl,
+		cpu_to_le32(dst_dsn), mpath->next_hop->sta.addr, hopcount, ttl,
 		cpu_to_le32(lifetime), cpu_to_le32(metric),
 		0, sdata);
 	rcu_read_unlock();
@@ -557,7 +557,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
 	if (mpath) {
 		spin_lock_bh(&mpath->state_lock);
 		if (mpath->flags & MESH_PATH_ACTIVE &&
-		    memcmp(ta, mpath->next_hop->addr, ETH_ALEN) == 0 &&
+		    memcmp(ta, mpath->next_hop->sta.addr, ETH_ALEN) == 0 &&
 		    (!(mpath->flags & MESH_PATH_DSN_VALID) ||
 		    DSN_GT(dst_dsn, mpath->dsn))) {
 			mpath->flags &= ~MESH_PATH_ACTIVE;
@@ -799,7 +799,7 @@ int mesh_nexthop_lookup(struct sk_buff *skb,
 			mesh_queue_preq(mpath,
 					PREQ_Q_F_START | PREQ_Q_F_REFRESH);
 		}
-		memcpy(hdr->addr1, mpath->next_hop->addr,
+		memcpy(hdr->addr1, mpath->next_hop->sta.addr,
 				ETH_ALEN);
 	} else {
 		if (!(mpath->flags & MESH_PATH_RESOLVING)) {
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 990a4b7f6bc..debf7834dbc 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -275,7 +275,7 @@ static void mesh_plink_timer(unsigned long data)
 		return;
 	}
 	mpl_dbg("Mesh plink timer for %s fired on state %d\n",
-			print_mac(mac, sta->addr), sta->plink_state);
+			print_mac(mac, sta->sta.addr), sta->plink_state);
 	reason = 0;
 	llid = sta->llid;
 	plid = sta->plid;
@@ -288,7 +288,7 @@ static void mesh_plink_timer(unsigned long data)
 		if (sta->plink_retries < dot11MeshMaxRetries(sdata)) {
 			u32 rand;
 			mpl_dbg("Mesh plink for %s (retry, timeout): %d %d\n",
-					print_mac(mac, sta->addr),
+					print_mac(mac, sta->sta.addr),
 					sta->plink_retries, sta->plink_timeout);
 			get_random_bytes(&rand, sizeof(u32));
 			sta->plink_timeout = sta->plink_timeout +
@@ -296,7 +296,7 @@ static void mesh_plink_timer(unsigned long data)
 			++sta->plink_retries;
 			mod_plink_timer(sta, sta->plink_timeout);
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->sta.addr, llid,
 					    0, 0);
 			break;
 		}
@@ -309,7 +309,7 @@ static void mesh_plink_timer(unsigned long data)
 		sta->plink_state = PLINK_HOLDING;
 		mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
 		spin_unlock_bh(&sta->lock);
-		mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid, plid,
+		mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, plid,
 				    reason);
 		break;
 	case PLINK_HOLDING:
@@ -352,10 +352,10 @@ int mesh_plink_open(struct sta_info *sta)
 	mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata));
 	spin_unlock_bh(&sta->lock);
 	mpl_dbg("Mesh plink: starting establishment with %s\n",
-		print_mac(mac, sta->addr));
+		print_mac(mac, sta->sta.addr));
 
 	return mesh_plink_frame_tx(sdata, PLINK_OPEN,
-				   sta->addr, llid, 0, 0);
+				   sta->sta.addr, llid, 0, 0);
 }
 
 void mesh_plink_block(struct sta_info *sta)
@@ -379,7 +379,7 @@ int mesh_plink_close(struct sta_info *sta)
 #endif
 
 	mpl_dbg("Mesh plink: closing link with %s\n",
-			print_mac(mac, sta->addr));
+			print_mac(mac, sta->sta.addr));
 	spin_lock_bh(&sta->lock);
 	sta->reason = cpu_to_le16(MESH_LINK_CANCELLED);
 	reason = sta->reason;
@@ -400,7 +400,7 @@ int mesh_plink_close(struct sta_info *sta)
 	llid = sta->llid;
 	plid = sta->plid;
 	spin_unlock_bh(&sta->lock);
-	mesh_plink_frame_tx(sta->sdata, PLINK_CLOSE, sta->addr, llid,
+	mesh_plink_frame_tx(sta->sdata, PLINK_CLOSE, sta->sta.addr, llid,
 			    plid, reason);
 	return 0;
 }
@@ -577,9 +577,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			sta->llid = llid;
 			mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata));
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->sta.addr, llid,
 					    0, 0);
-			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr,
 					    llid, plid, 0);
 			break;
 		default:
@@ -604,7 +604,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
@@ -613,7 +613,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			sta->plid = plid;
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
 					    plid, 0);
 			break;
 		case CNF_ACPT:
@@ -646,13 +646,13 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
 					    plid, 0);
 			break;
 		case CNF_ACPT:
@@ -661,7 +661,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			mesh_plink_inc_estab_count(sdata);
 			spin_unlock_bh(&sta->lock);
 			mpl_dbg("Mesh plink with %s ESTABLISHED\n",
-					print_mac(mac, sta->addr));
+					print_mac(mac, sta->sta.addr));
 			break;
 		default:
 			spin_unlock_bh(&sta->lock);
@@ -685,7 +685,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
@@ -694,8 +694,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			mesh_plink_inc_estab_count(sdata);
 			spin_unlock_bh(&sta->lock);
 			mpl_dbg("Mesh plink with %s ESTABLISHED\n",
-					print_mac(mac, sta->addr));
-			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
+					print_mac(mac, sta->sta.addr));
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
 					    plid, 0);
 			break;
 		default:
@@ -714,13 +714,13 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			llid = sta->llid;
 			mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
 					    plid, reason);
 			break;
 		case OPN_ACPT:
 			llid = sta->llid;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->addr, llid,
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
 					    plid, 0);
 			break;
 		default:
@@ -743,8 +743,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			llid = sta->llid;
 			reason = sta->reason;
 			spin_unlock_bh(&sta->lock);
-			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->addr, llid,
-					    plid, reason);
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr,
+					    llid, plid, reason);
 			break;
 		default:
 			spin_unlock_bh(&sta->lock);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8a2cfd3609b..35c421b89dd 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -804,7 +804,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	netif_tx_stop_all_queues(sdata->dev);
 	netif_carrier_off(sdata->dev);
 
-	ieee80211_sta_tear_down_BA_sessions(sdata, sta->addr);
+	ieee80211_sta_tear_down_BA_sessions(sdata, sta->sta.addr);
 
 	if (self_disconnected) {
 		if (deauth)
@@ -1507,7 +1507,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				printk(KERN_DEBUG "%s: updated supp_rates set "
 				    "for %s based on beacon info (0x%llx | "
 				    "0x%llx -> 0x%llx)\n",
-				    sdata->dev->name, print_mac(mac, sta->addr),
+				    sdata->dev->name,
+				    print_mac(mac, sta->sta.addr),
 				    (unsigned long long) prev_rates,
 				    (unsigned long long) supp_rates,
 				    (unsigned long long) sta->supp_rates[band]);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 8c3dda5f00b..92d898b901e 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -661,7 +661,7 @@ static void ap_sta_ps_start(struct net_device *dev, struct sta_info *sta)
 	set_and_clear_sta_flags(sta, WLAN_STA_PS, WLAN_STA_PSPOLL);
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 	printk(KERN_DEBUG "%s: STA %s aid %d enters power save mode\n",
-	       dev->name, print_mac(mac, sta->addr), sta->aid);
+	       dev->name, print_mac(mac, sta->sta.addr), sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 }
 
@@ -685,7 +685,7 @@ static int ap_sta_ps_end(struct net_device *dev, struct sta_info *sta)
 
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 	printk(KERN_DEBUG "%s: STA %s aid %d exits power save mode\n",
-	       dev->name, print_mac(mac, sta->addr), sta->aid);
+	       dev->name, print_mac(mac, sta->sta.addr), sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 
 	/* Send all buffered frames to the station */
@@ -702,7 +702,7 @@ static int ap_sta_ps_end(struct net_device *dev, struct sta_info *sta)
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 		printk(KERN_DEBUG "%s: STA %s aid %d send PS frame "
 		       "since STA not sleeping anymore\n", dev->name,
-		       print_mac(mac, sta->addr), sta->aid);
+		       print_mac(mac, sta->sta.addr), sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 		info->flags |= IEEE80211_TX_CTL_REQUEUE;
 		dev_queue_xmit(skb);
@@ -1007,7 +1007,7 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
 
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 		printk(KERN_DEBUG "STA %s aid %d: PS Poll (entries after %d)\n",
-		       print_mac(mac, rx->sta->addr), rx->sta->aid,
+		       print_mac(mac, rx->sta->sta.addr), rx->sta->sta.aid,
 		       skb_queue_len(&rx->sta->ps_tx_buf));
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 
@@ -1032,7 +1032,7 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
 		 */
 		printk(KERN_DEBUG "%s: STA %s sent PS Poll even "
 		       "though there are no buffered frames for it\n",
-		       rx->dev->name, print_mac(mac, rx->sta->addr));
+		       rx->dev->name, print_mac(mac, rx->sta->sta.addr));
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 	}
 
@@ -2140,7 +2140,7 @@ static u8 ieee80211_rx_reorder_ampdu(struct ieee80211_local *local,
 	/* if this mpdu is fragmented - terminate rx aggregation session */
 	sc = le16_to_cpu(hdr->seq_ctrl);
 	if (sc & IEEE80211_SCTL_FRAG) {
-		ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->addr,
+		ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->sta.addr,
 			tid, 0, WLAN_REASON_QSTA_REQUIRE_SETUP);
 		ret = 1;
 		goto end_reorder;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 31246d8e532..d9774ac2e0f 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -73,11 +73,11 @@ static int sta_info_hash_del(struct ieee80211_local *local,
 {
 	struct sta_info *s;
 
-	s = local->sta_hash[STA_HASH(sta->addr)];
+	s = local->sta_hash[STA_HASH(sta->sta.addr)];
 	if (!s)
 		return -ENOENT;
 	if (s == sta) {
-		rcu_assign_pointer(local->sta_hash[STA_HASH(sta->addr)],
+		rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)],
 				   s->hnext);
 		return 0;
 	}
@@ -94,13 +94,13 @@ static int sta_info_hash_del(struct ieee80211_local *local,
 
 /* protected by RCU */
 static struct sta_info *__sta_info_find(struct ieee80211_local *local,
-					u8 *addr)
+					const u8 *addr)
 {
 	struct sta_info *sta;
 
 	sta = rcu_dereference(local->sta_hash[STA_HASH(addr)]);
 	while (sta) {
-		if (compare_ether_addr(sta->addr, addr) == 0)
+		if (compare_ether_addr(sta->sta.addr, addr) == 0)
 			break;
 		sta = rcu_dereference(sta->hnext);
 	}
@@ -151,7 +151,7 @@ static void __sta_info_free(struct ieee80211_local *local,
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 	printk(KERN_DEBUG "%s: Destroyed STA %s\n",
-	       wiphy_name(local->hw.wiphy), print_mac(mbuf, sta->addr));
+	       wiphy_name(local->hw.wiphy), print_mac(mbuf, sta->sta.addr));
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 	kfree(sta);
@@ -219,8 +219,8 @@ void sta_info_destroy(struct sta_info *sta)
 static void sta_info_hash_add(struct ieee80211_local *local,
 			      struct sta_info *sta)
 {
-	sta->hnext = local->sta_hash[STA_HASH(sta->addr)];
-	rcu_assign_pointer(local->sta_hash[STA_HASH(sta->addr)], sta);
+	sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)];
+	rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta);
 }
 
 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
@@ -231,14 +231,14 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	int i;
 	DECLARE_MAC_BUF(mbuf);
 
-	sta = kzalloc(sizeof(*sta), gfp);
+	sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp);
 	if (!sta)
 		return NULL;
 
 	spin_lock_init(&sta->lock);
 	spin_lock_init(&sta->flaglock);
 
-	memcpy(sta->addr, addr, ETH_ALEN);
+	memcpy(sta->sta.addr, addr, ETH_ALEN);
 	sta->local = local;
 	sta->sdata = sdata;
 
@@ -271,7 +271,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 	printk(KERN_DEBUG "%s: Allocated STA %s\n",
-	       wiphy_name(local->hw.wiphy), print_mac(mbuf, sta->addr));
+	       wiphy_name(local->hw.wiphy), print_mac(mbuf, sta->sta.addr));
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 #ifdef CONFIG_MAC80211_MESH
@@ -300,15 +300,15 @@ int sta_info_insert(struct sta_info *sta)
 		goto out_free;
 	}
 
-	if (WARN_ON(compare_ether_addr(sta->addr, sdata->dev->dev_addr) == 0 ||
-	            is_multicast_ether_addr(sta->addr))) {
+	if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->dev->dev_addr) == 0 ||
+	            is_multicast_ether_addr(sta->sta.addr))) {
 		err = -EINVAL;
 		goto out_free;
 	}
 
 	spin_lock_irqsave(&local->sta_lock, flags);
 	/* check if STA exists already */
-	if (__sta_info_find(local, sta->addr)) {
+	if (__sta_info_find(local, sta->sta.addr)) {
 		spin_unlock_irqrestore(&local->sta_lock, flags);
 		err = -EEXIST;
 		goto out_free;
@@ -325,12 +325,12 @@ int sta_info_insert(struct sta_info *sta)
 					     u.ap);
 
 		local->ops->sta_notify(local_to_hw(local), &sdata->vif,
-				       STA_NOTIFY_ADD, sta->addr);
+				       STA_NOTIFY_ADD, &sta->sta);
 	}
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 	printk(KERN_DEBUG "%s: Inserted STA %s\n",
-	       wiphy_name(local->hw.wiphy), print_mac(mac, sta->addr));
+	       wiphy_name(local->hw.wiphy), print_mac(mac, sta->sta.addr));
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 	spin_unlock_irqrestore(&local->sta_lock, flags);
@@ -379,11 +379,12 @@ static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss,
 {
 	BUG_ON(!bss);
 
-	__bss_tim_set(bss, sta->aid);
+	__bss_tim_set(bss, sta->sta.aid);
 
 	if (sta->local->ops->set_tim) {
 		sta->local->tim_in_locked_section = true;
-		sta->local->ops->set_tim(local_to_hw(sta->local), sta->aid, 1);
+		sta->local->ops->set_tim(local_to_hw(sta->local),
+					 &sta->sta, true);
 		sta->local->tim_in_locked_section = false;
 	}
 }
@@ -404,11 +405,12 @@ static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss,
 {
 	BUG_ON(!bss);
 
-	__bss_tim_clear(bss, sta->aid);
+	__bss_tim_clear(bss, sta->sta.aid);
 
 	if (sta->local->ops->set_tim) {
 		sta->local->tim_in_locked_section = true;
-		sta->local->ops->set_tim(local_to_hw(sta->local), sta->aid, 0);
+		sta->local->ops->set_tim(local_to_hw(sta->local),
+					 &sta->sta, false);
 		sta->local->tim_in_locked_section = false;
 	}
 }
@@ -462,7 +464,7 @@ static void __sta_info_unlink(struct sta_info **sta)
 					     u.ap);
 
 		local->ops->sta_notify(local_to_hw(local), &sdata->vif,
-				       STA_NOTIFY_REMOVE, (*sta)->addr);
+				       STA_NOTIFY_REMOVE, &(*sta)->sta);
 	}
 
 	if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -474,7 +476,7 @@ static void __sta_info_unlink(struct sta_info **sta)
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 	printk(KERN_DEBUG "%s: Removed STA %s\n",
-	       wiphy_name(local->hw.wiphy), print_mac(mbuf, (*sta)->addr));
+	       wiphy_name(local->hw.wiphy), print_mac(mbuf, (*sta)->sta.addr));
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 	/*
@@ -570,7 +572,7 @@ static void sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
 		local->total_ps_buffered--;
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 		printk(KERN_DEBUG "Buffered frame expired (STA "
-		       "%s)\n", print_mac(mac, sta->addr));
+		       "%s)\n", print_mac(mac, sta->sta.addr));
 #endif
 		dev_kfree_skb(skb);
 
@@ -817,7 +819,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
 		if (time_after(jiffies, sta->last_rx + exp_time)) {
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 			printk(KERN_DEBUG "%s: expiring inactive STA %s\n",
-			       sdata->dev->name, print_mac(mac, sta->addr));
+			       sdata->dev->name, print_mac(mac, sta->sta.addr));
 #endif
 			__sta_info_unlink(&sta);
 			if (sta)
@@ -828,3 +830,14 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
 	list_for_each_entry_safe(sta, tmp, &tmp_list, list)
 		sta_info_destroy(sta);
 }
+
+struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_hw *hw,
+                                         const u8 *addr)
+{
+	struct sta_info *sta = __sta_info_find(hw_to_local(hw), addr);
+
+	if (!sta)
+		return NULL;
+	return &sta->sta;
+}
+EXPORT_SYMBOL(ieee80211_find_sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 22007990099..e7ce12dbf27 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -218,6 +218,7 @@ struct sta_ampdu_mlme {
  * @plink_timeout: TBD
  * @plink_timer: TBD
  * @debugfs: debug filesystem info
+ * @sta: station information we share with the driver
  */
 struct sta_info {
 	/* General information, mostly static */
@@ -232,8 +233,7 @@ struct sta_info {
 	spinlock_t flaglock;
 	struct ieee80211_ht_info ht_info;
 	u64 supp_rates[IEEE80211_NUM_BANDS];
-	u8 addr[ETH_ALEN];
-	u16 aid;
+
 	u16 listen_interval;
 
 	/*
@@ -327,6 +327,9 @@ struct sta_info {
 		struct dentry *agg_status;
 	} debugfs;
 #endif
+
+	/* keep last! */
+	struct ieee80211_sta sta;
 };
 
 static inline enum plink_state sta_plink_state(struct sta_info *sta)
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 995f7af3d25..34b32bc8f60 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -304,7 +304,7 @@ int ieee80211_tkip_decrypt_data(struct crypto_blkcipher *tfm,
 			key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
 			u8 bcast[ETH_ALEN] =
 				{0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-			u8 *sta_addr = key->sta->addr;
+			u8 *sta_addr = key->sta->sta.addr;
 
 			if (is_multicast_ether_addr(ra))
 				sta_addr = bcast;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f4bcc589d67..07bf228d0b1 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -381,7 +381,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 		printk(KERN_DEBUG "STA %s aid %d: PS buffer (entries "
 		       "before %d)\n",
-		       print_mac(mac, sta->addr), sta->aid,
+		       print_mac(mac, sta->sta.addr), sta->sta.aid,
 		       skb_queue_len(&sta->ps_tx_buf));
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 		if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
@@ -392,7 +392,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 			if (net_ratelimit()) {
 				printk(KERN_DEBUG "%s: STA %s TX "
 				       "buffer full - dropping oldest frame\n",
-				       tx->dev->name, print_mac(mac, sta->addr));
+				       tx->dev->name, print_mac(mac, sta->sta.addr));
 			}
 #endif
 			dev_kfree_skb(old);
@@ -411,7 +411,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 	else if (unlikely(test_sta_flags(sta, WLAN_STA_PS))) {
 		printk(KERN_DEBUG "%s: STA %s in PS mode, but pspoll "
 		       "set -> send frame\n", tx->dev->name,
-		       print_mac(mac, sta->addr));
+		       print_mac(mac, sta->sta.addr));
 	}
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 	clear_sta_flags(sta, WLAN_STA_PSPOLL);
@@ -528,7 +528,7 @@ ieee80211_tx_h_misc(struct ieee80211_tx_data *tx)
 	sband = tx->local->hw.wiphy->bands[tx->channel->band];
 
 	if (tx->sta)
-		info->control.aid = tx->sta->aid;
+		info->control.sta = &tx->sta->sta;
 
 	if (!info->control.retry_limit) {
 		if (!is_multicast_ether_addr(hdr->addr1)) {
@@ -608,7 +608,7 @@ ieee80211_tx_h_misc(struct ieee80211_tx_data *tx)
 	}
 
 	if (tx->sta)
-		info->control.aid = tx->sta->aid;
+		info->control.sta = &tx->sta->sta;
 
 	return TX_CONTINUE;
 }
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 7229e958879..6748dedcab5 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -210,7 +210,7 @@ int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
 				DECLARE_MAC_BUF(mac);
 				printk(KERN_DEBUG "allocated aggregation queue"
 					" %d tid %d addr %s pool=0x%lX\n",
-					i, tid, print_mac(mac, sta->addr),
+					i, tid, print_mac(mac, sta->sta.addr),
 					local->queue_pool[0]);
 			}
 #endif /* CONFIG_MAC80211_HT_DEBUG */
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 78021780b88..37ae9a959f6 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -256,7 +256,7 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
 
 	res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm,
 					  key, skb->data + hdrlen,
-					  skb->len - hdrlen, rx->sta->addr,
+					  skb->len - hdrlen, rx->sta->sta.addr,
 					  hdr->addr1, hwaccel, rx->queue,
 					  &rx->tkip_iv32,
 					  &rx->tkip_iv16);
-- 
cgit v1.2.3


From 95dac040041723d0c0ab245642c1b9802f12cc8d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 02:03:28 +0200
Subject: mac80211: small rate control changes

This patch fixes mac80211 to not rely on the rate control
algorithm to update sta->tx_retry_failed and sta->tx_retry_count
(even if we don't currently use them), removes a number of
completely unused values we don't even show in debugfs and
changes the code in ieee80211_tx_status() to not look up the
sta_info repeatedly.

The only behaviour change here would be not calling the rate
control function rate_control_tx_status() when no sta_info is
found, but all rate control algorithms ignore such calls anyway.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c             | 50 ++++++++++++++++++++---------------------
 net/mac80211/rc80211_pid_algo.c | 11 ---------
 net/mac80211/sta_info.h         |  7 ------
 3 files changed, 25 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index dd838b725af..c307dba7ec0 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -546,29 +546,27 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 
 	rcu_read_lock();
 
-	if (info->status.excessive_retries) {
-		sta = sta_info_get(local, hdr->addr1);
-		if (sta) {
-			if (test_sta_flags(sta, WLAN_STA_PS)) {
-				/*
-				 * The STA is in power save mode, so assume
-				 * that this TX packet failed because of that.
-				 */
-				ieee80211_handle_filtered_frame(local, sta, skb);
-				rcu_read_unlock();
-				return;
-			}
+	sta = sta_info_get(local, hdr->addr1);
+
+	if (sta) {
+		if (info->status.excessive_retries &&
+		    test_sta_flags(sta, WLAN_STA_PS)) {
+			/*
+			 * The STA is in power save mode, so assume
+			 * that this TX packet failed because of that.
+			 */
+			ieee80211_handle_filtered_frame(local, sta, skb);
+			rcu_read_unlock();
+			return;
 		}
-	}
 
-	fc = hdr->frame_control;
+		fc = hdr->frame_control;
+
+		if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) &&
+		    (ieee80211_is_data_qos(fc))) {
+			u16 tid, ssn;
+			u8 *qc;
 
-	if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) &&
-	    (ieee80211_is_data_qos(fc))) {
-		u16 tid, ssn;
-		u8 *qc;
-		sta = sta_info_get(local, hdr->addr1);
-		if (sta) {
 			qc = ieee80211_get_qos_ctl(hdr);
 			tid = qc[0] & 0xf;
 			ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10)
@@ -576,17 +574,19 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 			ieee80211_send_bar(sta->sdata, hdr->addr1,
 					   tid, ssn);
 		}
-	}
 
-	if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) {
-		sta = sta_info_get(local, hdr->addr1);
-		if (sta) {
+		if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) {
 			ieee80211_handle_filtered_frame(local, sta, skb);
 			rcu_read_unlock();
 			return;
+		} else {
+			if (info->status.excessive_retries)
+				sta->tx_retry_failed++;
+			sta->tx_retry_count += info->status.retry_count;
 		}
-	} else
+
 		rate_control_tx_status(local->mdev, skb);
+	}
 
 	rcu_read_unlock();
 
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index 21e1942ea97..94867860c3e 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -282,17 +282,6 @@ static void rate_control_pid_tx_status(void *priv, struct net_device *dev,
 		spinfo->tx_num_xmit++;
 	}
 
-	if (info->status.excessive_retries) {
-		sta->tx_retry_failed++;
-		sta->tx_num_consecutive_failures++;
-		sta->tx_num_mpdu_fail++;
-	} else {
-		sta->tx_num_consecutive_failures = 0;
-		sta->tx_num_mpdu_ok++;
-	}
-	sta->tx_retry_count += info->status.retry_count;
-	sta->tx_num_mpdu_fail += info->status.retry_count;
-
 	/* Update PID controller state. */
 	period = (HZ * pinfo->sampling_period + 500) / 1000;
 	if (!period)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index e7ce12dbf27..4a9b96eeb68 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -195,9 +195,6 @@ struct sta_ampdu_mlme {
  * @tx_filtered_count: TBD
  * @tx_retry_failed: TBD
  * @tx_retry_count: TBD
- * @tx_num_consecutive_failures: TBD
- * @tx_num_mpdu_ok: TBD
- * @tx_num_mpdu_fail: TBD
  * @fail_avg: moving percentage of failed MSDUs
  * @tx_packets: number of RX/TX MSDUs
  * @tx_bytes: TBD
@@ -273,10 +270,6 @@ struct sta_info {
 	/* Updated from TX status path only, no locking requirements */
 	unsigned long tx_filtered_count;
 	unsigned long tx_retry_failed, tx_retry_count;
-	/* TODO: update in generic code not rate control? */
-	u32 tx_num_consecutive_failures;
-	u32 tx_num_mpdu_ok;
-	u32 tx_num_mpdu_fail;
 	/* moving percentage of failed MSDUs */
 	unsigned int fail_avg;
 
-- 
cgit v1.2.3


From b7e35008815a1c39123f4dd53b430788e2e18da4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 02:22:58 +0200
Subject: mac80211: move last_txrate_idx into RC algorithms

This variable in sta_info is only used in a meaningful way
by the Intel RC algorithms, so move it into those.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rc80211_pid_algo.c | 2 --
 net/mac80211/sta_info.h         | 2 --
 2 files changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index 94867860c3e..24e44f52130 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -329,8 +329,6 @@ static void rate_control_pid_get_rate(void *priv, struct net_device *dev,
 	if (rateidx >= sband->n_bitrates)
 		rateidx = sband->n_bitrates - 1;
 
-	sta->last_txrate_idx = rateidx;
-
 	rcu_read_unlock();
 
 	sel->rate_idx = rateidx;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 4a9b96eeb68..df42d181545 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -200,7 +200,6 @@ struct sta_ampdu_mlme {
  * @tx_bytes: TBD
  * @tx_fragments: number of transmitted MPDUs
  * @txrate_idx: TBD
- * @last_txrate_idx: TBD
  * @tid_seq: TBD
  * @wme_tx_queue: TBD
  * @ampdu_mlme: TBD
@@ -278,7 +277,6 @@ struct sta_info {
 	unsigned long tx_bytes;
 	unsigned long tx_fragments;
 	int txrate_idx;
-	int last_txrate_idx;
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
 #ifdef CONFIG_MAC80211_DEBUG_COUNTERS
 	unsigned int wme_tx_queue[NUM_RX_DATA_QUEUES];
-- 
cgit v1.2.3


From 323ce79a9cdbf838ea577677b1ddace8e0b4d4c6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 02:45:11 +0200
Subject: mac80211: share sta->supp_rates

As more preparation for a saner rate control algorithm API,
share the supported rates bitmap in the public API.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c        |  2 +-
 net/mac80211/mesh_plink.c |  4 ++--
 net/mac80211/mlme.c       | 12 ++++++------
 net/mac80211/rate.h       |  2 +-
 net/mac80211/sta_info.h   |  2 --
 5 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index ed5e77ce627..47988d2eb15 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -667,7 +667,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 					rates |= BIT(j);
 			}
 		}
-		sta->supp_rates[local->oper_channel->band] = rates;
+		sta->sta.supp_rates[local->oper_channel->band] = rates;
 	}
 
 	if (params->ht_capa) {
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index debf7834dbc..faac101c0f8 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -106,7 +106,7 @@ static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata,
 		return NULL;
 
 	sta->flags = WLAN_STA_AUTHORIZED;
-	sta->supp_rates[local->hw.conf.channel->band] = rates;
+	sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
 
 	return sta;
 }
@@ -243,7 +243,7 @@ void mesh_neighbour_update(u8 *hw_addr, u64 rates, struct ieee80211_sub_if_data
 	}
 
 	sta->last_rx = jiffies;
-	sta->supp_rates[local->hw.conf.channel->band] = rates;
+	sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
 	if (peer_accepting_plinks && sta->plink_state == PLINK_LISTEN &&
 			sdata->u.mesh.accepting_plinks &&
 			sdata->u.mesh.mshcfg.auto_open_plinks)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 35c421b89dd..c049f336e58 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1301,7 +1301,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	sta->supp_rates[local->hw.conf.channel->band] = rates;
+	sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
 	sdata->bss_conf.basic_rates = basic_rates;
 
 	/* cf. IEEE 802.11 9.2.12 */
@@ -1497,13 +1497,13 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		if (sta) {
 			u64 prev_rates;
 
-			prev_rates = sta->supp_rates[band];
+			prev_rates = sta->sta.supp_rates[band];
 			/* make sure mandatory rates are always added */
-			sta->supp_rates[band] = supp_rates |
+			sta->sta.supp_rates[band] = supp_rates |
 				ieee80211_mandatory_rates(local, band);
 
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
-			if (sta->supp_rates[band] != prev_rates)
+			if (sta->sta.supp_rates[band] != prev_rates)
 				printk(KERN_DEBUG "%s: updated supp_rates set "
 				    "for %s based on beacon info (0x%llx | "
 				    "0x%llx -> 0x%llx)\n",
@@ -1511,7 +1511,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 				    print_mac(mac, sta->sta.addr),
 				    (unsigned long long) prev_rates,
 				    (unsigned long long) supp_rates,
-				    (unsigned long long) sta->supp_rates[band]);
+				    (unsigned long long) sta->sta.supp_rates[band]);
 #endif
 		} else {
 			ieee80211_ibss_add_sta(sdata, NULL, mgmt->bssid,
@@ -2339,7 +2339,7 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 	set_sta_flags(sta, WLAN_STA_AUTHORIZED);
 
 	/* make sure mandatory rates are always added */
-	sta->supp_rates[band] = supp_rates |
+	sta->sta.supp_rates[band] = supp_rates |
 			ieee80211_mandatory_rates(local, band);
 
 	rate_control_rate_init(sta, local);
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index ede7ab56f65..5f18c27eb90 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -134,7 +134,7 @@ static inline int rate_supported(struct sta_info *sta,
 				 enum ieee80211_band band,
 				 int index)
 {
-	return (sta == NULL || sta->supp_rates[band] & BIT(index));
+	return (sta == NULL || sta->sta.supp_rates[band] & BIT(index));
 }
 
 static inline s8
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index df42d181545..4dafa044b2f 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -168,7 +168,6 @@ struct sta_ampdu_mlme {
  *	in the header file.
  * @flaglock: spinlock for flags accesses
  * @ht_info: HT capabilities of this STA
- * @supp_rates: Bitmap of supported rates (per band)
  * @addr: MAC address of this STA
  * @aid: STA's unique AID (1..2007, 0 = not assigned yet),
  *	only used in AP (and IBSS?) mode
@@ -228,7 +227,6 @@ struct sta_info {
 	spinlock_t lock;
 	spinlock_t flaglock;
 	struct ieee80211_ht_info ht_info;
-	u64 supp_rates[IEEE80211_NUM_BANDS];
 
 	u16 listen_interval;
 
-- 
cgit v1.2.3


From ae17e986091637e7ef5a8224c7b689029b105131 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 03:04:36 +0200
Subject: mac80211: move txrate_idx into RC algorithms

The sta_info->txrate_idx member isn't used by all RC algorithms
in the way it was intended to be used, move it into those that
require it (only PID) and keep track in the core code of which
rate was last used for reporting to userspace and the mesh MLME.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mesh_hwmp.c        |  2 +-
 net/mac80211/rc80211_pid.h      |  2 ++
 net/mac80211/rc80211_pid_algo.c | 33 +++++++++++++++++++--------------
 net/mac80211/sta_info.h         |  2 +-
 net/mac80211/tx.c               |  2 ++
 net/mac80211/wext.c             |  4 ++--
 6 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 15a5c99270a..501c7831adb 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -223,7 +223,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local,
 	/* bitrate is in units of 100 Kbps, while we need rate in units of
 	 * 1Mbps. This will be corrected on tx_time computation.
 	 */
-	rate = sband->bitrates[sta->txrate_idx].bitrate;
+	rate = sband->bitrates[sta->last_txrate_idx].bitrate;
 	tx_time = (device_constant + 10 * test_frame_len / rate);
 	estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err));
 	result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ;
diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h
index 0a9135b974b..ffafc5da572 100644
--- a/net/mac80211/rc80211_pid.h
+++ b/net/mac80211/rc80211_pid.h
@@ -180,6 +180,8 @@ struct rc_pid_sta_info {
 	u32 tx_num_failed;
 	u32 tx_num_xmit;
 
+	int txrate_idx;
+
 	/* Average failed frames percentage error (i.e. actual vs. target
 	 * percentage), scaled by RC_PID_SMOOTHING. This value is computed
 	 * using using an exponential weighted average technique:
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index 24e44f52130..bc1c4569caa 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -75,7 +75,8 @@ static void rate_control_pid_adjust_rate(struct ieee80211_local *local,
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_supported_band *sband;
 	int cur_sorted, new_sorted, probe, tmp, n_bitrates, band;
-	int cur = sta->txrate_idx;
+	struct rc_pid_sta_info *spinfo = (void *)sta->rate_ctrl_priv;
+	int cur = spinfo->txrate_idx;
 
 	sdata = sta->sdata;
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
@@ -111,7 +112,7 @@ static void rate_control_pid_adjust_rate(struct ieee80211_local *local,
 	/* Fit the rate found to the nearest supported rate. */
 	do {
 		if (rate_supported(sta, band, rinfo[tmp].index)) {
-			sta->txrate_idx = rinfo[tmp].index;
+			spinfo->txrate_idx = rinfo[tmp].index;
 			break;
 		}
 		if (adj < 0)
@@ -121,9 +122,9 @@ static void rate_control_pid_adjust_rate(struct ieee80211_local *local,
 	} while (tmp < n_bitrates && tmp >= 0);
 
 #ifdef CONFIG_MAC80211_DEBUGFS
-	rate_control_pid_event_rate_change(
-		&((struct rc_pid_sta_info *)sta->rate_ctrl_priv)->events,
-		sta->txrate_idx, sband->bitrates[sta->txrate_idx].bitrate);
+	rate_control_pid_event_rate_change(&spinfo->events,
+		spinfo->txrate_idx,
+		sband->bitrates[spinfo->txrate_idx].bitrate);
 #endif
 }
 
@@ -190,16 +191,16 @@ static void rate_control_pid_sample(struct rc_pid_info *pinfo,
 	spinfo->tx_num_failed = 0;
 
 	/* If we just switched rate, update the rate behaviour info. */
-	if (pinfo->oldrate != sta->txrate_idx) {
+	if (pinfo->oldrate != spinfo->txrate_idx) {
 
 		i = rinfo[pinfo->oldrate].rev_index;
-		j = rinfo[sta->txrate_idx].rev_index;
+		j = rinfo[spinfo->txrate_idx].rev_index;
 
 		tmp = (pf - spinfo->last_pf);
 		tmp = RC_PID_DO_ARITH_RIGHT_SHIFT(tmp, RC_PID_ARITH_SHIFT);
 
 		rinfo[j].diff = rinfo[i].diff + tmp;
-		pinfo->oldrate = sta->txrate_idx;
+		pinfo->oldrate = spinfo->txrate_idx;
 	}
 	rate_control_pid_normalize(pinfo, sband->n_bitrates);
 
@@ -252,19 +253,20 @@ static void rate_control_pid_tx_status(void *priv, struct net_device *dev,
 	if (!sta)
 		goto unlock;
 
+	spinfo = sta->rate_ctrl_priv;
+
 	/* Don't update the state if we're not controlling the rate. */
 	sdata = sta->sdata;
 	if (sdata->force_unicast_rateidx > -1) {
-		sta->txrate_idx = sdata->max_ratectrl_rateidx;
+		spinfo->txrate_idx = sdata->max_ratectrl_rateidx;
 		goto unlock;
 	}
 
 	/* Ignore all frames that were sent with a different rate than the rate
 	 * we currently advise mac80211 to use. */
-	if (info->tx_rate_idx != sta->txrate_idx)
+	if (info->tx_rate_idx != spinfo->txrate_idx)
 		goto unlock;
 
-	spinfo = sta->rate_ctrl_priv;
 	spinfo->tx_num_xmit++;
 
 #ifdef CONFIG_MAC80211_DEBUGFS
@@ -301,6 +303,7 @@ static void rate_control_pid_get_rate(void *priv, struct net_device *dev,
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct ieee80211_sub_if_data *sdata;
+	struct rc_pid_sta_info *spinfo;
 	struct sta_info *sta;
 	int rateidx;
 	u16 fc;
@@ -321,10 +324,11 @@ static void rate_control_pid_get_rate(void *priv, struct net_device *dev,
 
 	/* If a forced rate is in effect, select it. */
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	spinfo = (struct rc_pid_sta_info *)sta->rate_ctrl_priv;
 	if (sdata->force_unicast_rateidx > -1)
-		sta->txrate_idx = sdata->force_unicast_rateidx;
+		spinfo->txrate_idx = sdata->force_unicast_rateidx;
 
-	rateidx = sta->txrate_idx;
+	rateidx = spinfo->txrate_idx;
 
 	if (rateidx >= sband->n_bitrates)
 		rateidx = sband->n_bitrates - 1;
@@ -349,9 +353,10 @@ static void rate_control_pid_rate_init(void *priv, void *priv_sta,
 	 * Until that method is implemented, we will use the lowest supported
 	 * rate as a workaround. */
 	struct ieee80211_supported_band *sband;
+	struct rc_pid_sta_info *spinfo = (void *)sta->rate_ctrl_priv;
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-	sta->txrate_idx = rate_lowest_index(local, sband, sta);
+	spinfo->txrate_idx = rate_lowest_index(local, sband, sta);
 	sta->fail_avg = 0;
 }
 
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 4dafa044b2f..5d8fabf7a68 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -274,7 +274,7 @@ struct sta_info {
 	unsigned long tx_packets;
 	unsigned long tx_bytes;
 	unsigned long tx_fragments;
-	int txrate_idx;
+	unsigned int last_txrate_idx;
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
 #ifdef CONFIG_MAC80211_DEBUG_COUNTERS
 	unsigned int wme_tx_queue[NUM_RX_DATA_QUEUES];
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 07bf228d0b1..7468495d6f9 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -485,6 +485,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
 
 	if (likely(tx->rate_idx < 0)) {
 		rate_control_get_rate(tx->dev, sband, tx->skb, &rsel);
+		if (tx->sta)
+			tx->sta->last_txrate_idx = rsel.rate_idx;
 		tx->rate_idx = rsel.rate_idx;
 		if (unlikely(rsel.probe_idx >= 0)) {
 			info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index aef9707700f..7e0d53abde2 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -636,8 +636,8 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev,
 
 	sta = sta_info_get(local, sdata->u.sta.bssid);
 
-	if (sta && sta->txrate_idx < sband->n_bitrates)
-		rate->value = sband->bitrates[sta->txrate_idx].bitrate;
+	if (sta && sta->last_txrate_idx < sband->n_bitrates)
+		rate->value = sband->bitrates[sta->last_txrate_idx].bitrate;
 	else
 		rate->value = 0;
 
-- 
cgit v1.2.3


From 687c7c0807371aeaa94ff2fff511eeb326b5c5de Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 03:14:11 +0200
Subject: mac80211: share sta_info->ht_info

Rate control algorithms may need access to a station's
HT capabilities, so share the ht_info struct in the
public station API.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c      | 2 +-
 net/mac80211/mlme.c     | 4 ++--
 net/mac80211/sta_info.h | 2 --
 3 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 47988d2eb15..e2574885db4 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -672,7 +672,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 
 	if (params->ht_capa) {
 		ieee80211_ht_cap_ie_to_ht_info(params->ht_capa,
-					       &sta->ht_info);
+					       &sta->sta.ht_info);
 	}
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && params->plink_action) {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c049f336e58..8611a8318c9 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1316,11 +1316,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_ht_bss_info bss_info;
 		ieee80211_ht_cap_ie_to_ht_info(
 				(struct ieee80211_ht_cap *)
-				elems.ht_cap_elem, &sta->ht_info);
+				elems.ht_cap_elem, &sta->sta.ht_info);
 		ieee80211_ht_addt_info_ie_to_ht_bss_info(
 				(struct ieee80211_ht_addt_info *)
 				elems.ht_info_elem, &bss_info);
-		ieee80211_handle_ht(local, 1, &sta->ht_info, &bss_info);
+		ieee80211_handle_ht(local, 1, &sta->sta.ht_info, &bss_info);
 	}
 
 	rate_control_rate_init(sta, local);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 5d8fabf7a68..b773c7b8d29 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -167,7 +167,6 @@ struct sta_ampdu_mlme {
  * @lock: used for locking all fields that require locking, see comments
  *	in the header file.
  * @flaglock: spinlock for flags accesses
- * @ht_info: HT capabilities of this STA
  * @addr: MAC address of this STA
  * @aid: STA's unique AID (1..2007, 0 = not assigned yet),
  *	only used in AP (and IBSS?) mode
@@ -226,7 +225,6 @@ struct sta_info {
 	void *rate_ctrl_priv;
 	spinlock_t lock;
 	spinlock_t flaglock;
-	struct ieee80211_ht_info ht_info;
 
 	u16 listen_interval;
 
-- 
cgit v1.2.3


From 3061307013267c2c75efae3925f461858d832101 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 11 Sep 2008 05:27:40 +0200
Subject: mac80211: pass AP vif pointer for VLANs

We cannot pass a VLAN vif pointer to the driver since those are
entirely virtual and we never tell the driver.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/tx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 7468495d6f9..698c8233e6b 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1351,6 +1351,10 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 		return 0;
 	}
 
+	if (osdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		osdata = container_of(osdata->bss,
+				      struct ieee80211_sub_if_data,
+				      u.ap);
 	info->control.vif = &osdata->vif;
 	ret = ieee80211_tx(odev, skb);
 	dev_put(odev);
-- 
cgit v1.2.3


From bed7aac9416f50425d2200df32bcc9bf248ff8cb Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Tue, 26 Aug 2008 11:58:01 -0300
Subject: rfkill: remove transmitter blocking on suspend

Currently, rfkill would stand in the way of properly supporting wireless
devices that are capable of waking the system up from sleep or hibernation
when they receive a special wireless message.  It would also get in the way
of mesh devices that need to remain operational even during platform
suspend.

To avoid that, stop trying to block the transmitters on the rfkill class
suspend handler.

Drivers that need rfkill's older behaviour will have to implement it by
themselves in their own suspend handling.

Do note that rfkill *will* attempt to restore the transmitter state on
resume in any situation.  This happens after the driver's resume method is
called by the suspend core (class devices resume after the devices they are
attached to have been resumed).

The following drivers need to check if they need to explicitly block
their transmitters in their own suspend handlers (maintainers Cc'd):
	arch/arm/mach-pxa/tosa-bt.c
	drivers/net/usb/hso.c
	drivers/net/wireless/rt2x00/* (USB might need it?)
	drivers/net/wireless/b43/ (SSB over USB might need it?)
	drivers/misc/hp-wmi.c
	eeepc-laptop w/rfkill support (not in mainline yet)
	Compal laptop w/rfkill support (not in mainline yet)
	toshiba-acpi w/rfkill support (not in mainline yet)

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: Ivo van Doorn <IvDoorn@gmail.com>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Andrew Bird <ajb@spheresystems.co.uk>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Cezary Jackiewicz <cezary.jackiewicz@gmail.com>
Cc: Philip Langdale <philipl@overt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index d5735799ccd..ea0dc04b3c7 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -512,21 +512,9 @@ static void rfkill_release(struct device *dev)
 #ifdef CONFIG_PM
 static int rfkill_suspend(struct device *dev, pm_message_t state)
 {
-	struct rfkill *rfkill = to_rfkill(dev);
-
-	if (dev->power.power_state.event != state.event) {
-		if (state.event & PM_EVENT_SLEEP) {
-			/* Stop transmitter, keep state, no notifies */
-			update_rfkill_state(rfkill);
-
-			mutex_lock(&rfkill->mutex);
-			rfkill->toggle_radio(rfkill->data,
-						RFKILL_STATE_SOFT_BLOCKED);
-			mutex_unlock(&rfkill->mutex);
-		}
-
+	/* mark class device as suspended */
+	if (dev->power.power_state.event != state.event)
 		dev->power.power_state = state;
-	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 25d834e16294c8dfd923dae6bdb8a055391a99a5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 12 Sep 2008 22:52:47 +0200
Subject: mac80211: fix virtual interfaces vs. injection

Currently, virtual interface pointers passed to drivers might be
from monitor interfaces and as such completely uninitialised
because we do not tell the driver about monitor interfaces when
those are created. Instead of passing them, we should therefore
indicate to the driver that there is no information; do that by
passing a NULL value and adjust drivers to cope with it.

As a result, some mac80211 API functions also need to cope with
a NULL vif pointer so drivers can still call them unconditionally.

Also, when injecting frames we really don't want to pass NULL all
the time, if we know we are the source address of a frame and have
a local interface for that address, we can to use that interface.
This also helps with processing the frame correctly for that
interface which will help the 802.11w implementation. It's not
entirely correct for VLANs or WDS interfaces because there the MAC
address isn't unique, but it's already a lot better than what we
do now.

Finally, when injecting without a matching local interface, don't
assign sequence numbers at all.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/tx.c   | 64 +++++++++++++++++++++++++++++++++++++++++++++++++----
 net/mac80211/util.c | 37 +++++++++++++++++++++----------
 2 files changed, 85 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 698c8233e6b..c12f361d718 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -624,7 +624,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	u8 *qc;
 	int tid;
 
-	/* only for injected frames */
+	/*
+	 * Packet injection may want to control the sequence
+	 * number, if we have no matching interface then we
+	 * neither assign one ourselves nor ask the driver to.
+	 */
+	if (unlikely(!info->control.vif))
+		return TX_CONTINUE;
+
 	if (unlikely(ieee80211_is_ctl(hdr->frame_control)))
 		return TX_CONTINUE;
 
@@ -849,7 +856,6 @@ __ieee80211_parse_tx_radiotap(struct ieee80211_tx_data *tx,
 	sband = tx->local->hw.wiphy->bands[tx->channel->band];
 
 	skb->do_not_encrypt = 1;
-	info->flags |= IEEE80211_TX_CTL_INJECTED;
 	tx->flags &= ~IEEE80211_TX_FRAGMENTED;
 
 	/*
@@ -981,7 +987,7 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 
 	/* process and remove the injection radiotap header */
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	if (unlikely(sdata->vif.type == NL80211_IFTYPE_MONITOR)) {
+	if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED)) {
 		if (__ieee80211_parse_tx_radiotap(tx, skb) == TX_DROP)
 			return TX_DROP;
 
@@ -1300,6 +1306,11 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 	struct ieee80211_sub_if_data *osdata;
 	int headroom;
 	bool may_encrypt;
+	enum {
+		NOT_MONITOR,
+		FOUND_SDATA,
+		UNKNOWN_ADDRESS,
+	} monitor_iface = NOT_MONITOR;
 	int ret;
 
 	if (skb->iif)
@@ -1335,6 +1346,50 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 				IEEE80211_IFSTA_MESH_CTR_INC(&osdata->u.mesh,
 							     fwded_frames);
 		}
+	} else if (unlikely(osdata->vif.type == NL80211_IFTYPE_MONITOR)) {
+		struct ieee80211_sub_if_data *sdata;
+		struct ieee80211_local *local = osdata->local;
+		struct ieee80211_hdr *hdr;
+		int hdrlen;
+		u16 len_rthdr;
+
+		info->flags |= IEEE80211_TX_CTL_INJECTED;
+		monitor_iface = UNKNOWN_ADDRESS;
+
+		len_rthdr = ieee80211_get_radiotap_len(skb->data);
+		hdr = (struct ieee80211_hdr *)skb->data + len_rthdr;
+		hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+		/* check the header is complete in the frame */
+		if (likely(skb->len >= len_rthdr + hdrlen)) {
+			/*
+			 * We process outgoing injected frames that have a
+			 * local address we handle as though they are our
+			 * own frames.
+			 * This code here isn't entirely correct, the local
+			 * MAC address is not necessarily enough to find
+			 * the interface to use; for that proper VLAN/WDS
+			 * support we will need a different mechanism.
+			 */
+
+			rcu_read_lock();
+			list_for_each_entry_rcu(sdata, &local->interfaces,
+						list) {
+				if (!netif_running(sdata->dev))
+					continue;
+				if (compare_ether_addr(sdata->dev->dev_addr,
+						       hdr->addr2)) {
+					dev_hold(sdata->dev);
+					dev_put(odev);
+					osdata = sdata;
+					odev = osdata->dev;
+					skb->iif = sdata->dev->ifindex;
+					monitor_iface = FOUND_SDATA;
+					break;
+				}
+			}
+			rcu_read_unlock();
+		}
 	}
 
 	may_encrypt = !skb->do_not_encrypt;
@@ -1355,7 +1410,8 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 		osdata = container_of(osdata->bss,
 				      struct ieee80211_sub_if_data,
 				      u.ap);
-	info->control.vif = &osdata->vif;
+	if (likely(monitor_iface != UNKNOWN_ADDRESS))
+		info->control.vif = &osdata->vif;
 	ret = ieee80211_tx(odev, skb);
 	dev_put(odev);
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 6eb222369bc..f32561ec224 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -231,16 +231,21 @@ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
 					struct ieee80211_rate *rate)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_sub_if_data *sdata;
 	u16 dur;
 	int erp;
+	bool short_preamble = false;
 
 	erp = 0;
-	if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
-		erp = rate->flags & IEEE80211_RATE_ERP_G;
+	if (vif) {
+		sdata = vif_to_sdata(vif);
+		short_preamble = sdata->bss_conf.use_short_preamble;
+		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+			erp = rate->flags & IEEE80211_RATE_ERP_G;
+	}
 
 	dur = ieee80211_frame_duration(local, frame_len, rate->bitrate, erp,
-				       sdata->bss_conf.use_short_preamble);
+				       short_preamble);
 
 	return cpu_to_le16(dur);
 }
@@ -252,7 +257,7 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_rate *rate;
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_sub_if_data *sdata;
 	bool short_preamble;
 	int erp;
 	u16 dur;
@@ -260,13 +265,17 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
-	short_preamble = sdata->bss_conf.use_short_preamble;
+	short_preamble = false;
 
 	rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
 
 	erp = 0;
-	if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
-		erp = rate->flags & IEEE80211_RATE_ERP_G;
+	if (vif) {
+		sdata = vif_to_sdata(vif);
+		short_preamble = sdata->bss_conf.use_short_preamble;
+		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+			erp = rate->flags & IEEE80211_RATE_ERP_G;
+	}
 
 	/* CTS duration */
 	dur = ieee80211_frame_duration(local, 10, rate->bitrate,
@@ -289,7 +298,7 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_rate *rate;
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_sub_if_data *sdata;
 	bool short_preamble;
 	int erp;
 	u16 dur;
@@ -297,12 +306,16 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
-	short_preamble = sdata->bss_conf.use_short_preamble;
+	short_preamble = false;
 
 	rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
 	erp = 0;
-	if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
-		erp = rate->flags & IEEE80211_RATE_ERP_G;
+	if (vif) {
+		sdata = vif_to_sdata(vif);
+		short_preamble = sdata->bss_conf.use_short_preamble;
+		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+			erp = rate->flags & IEEE80211_RATE_ERP_G;
+	}
 
 	/* Data frame duration */
 	dur = ieee80211_frame_duration(local, frame_len, rate->bitrate,
-- 
cgit v1.2.3


From 9222963f7c81e2897833a43066c9db75350c8586 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 12 Sep 2008 09:53:27 +0200
Subject: mac80211: fix sta_info kernel-doc warning

Sorry, forgot to run kernel-doc and just got the output from the nightly
run by email, this fixes a warning which I introduced when doing the
first RC API cleanups.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/sta_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index b773c7b8d29..daedfa9e1c6 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -197,7 +197,7 @@ struct sta_ampdu_mlme {
  * @tx_packets: number of RX/TX MSDUs
  * @tx_bytes: TBD
  * @tx_fragments: number of transmitted MPDUs
- * @txrate_idx: TBD
+ * @last_txrate_idx: Index of the last used transmit rate
  * @tid_seq: TBD
  * @wme_tx_queue: TBD
  * @ampdu_mlme: TBD
-- 
cgit v1.2.3


From c1b6cf4ee0fb8a3698c563e101a60f9ee4910de0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 12 Sep 2008 11:05:39 +0200
Subject: mac80211: remove beacon counters

The beacon counters mac80211 keeps are only used for debugfs,
unfortunately, they are incorrect for many hardware designs,
namely any design that has a beacon template. Hence, remove the
counters so we don't create the impression they are usable.

This also allows removing the beacon MESH #ifdef again.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/debugfs_netdev.c | 6 ------
 net/mac80211/ieee80211_i.h    | 5 -----
 net/mac80211/tx.c             | 9 ---------
 3 files changed, 20 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 1b33cad24ab..2a451562377 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -173,7 +173,6 @@ IEEE80211_IF_FILE(assoc_tries, u.sta.assoc_tries, DEC);
 IEEE80211_IF_FILE(auth_algs, u.sta.auth_algs, HEX);
 IEEE80211_IF_FILE(auth_alg, u.sta.auth_alg, DEC);
 IEEE80211_IF_FILE(auth_transaction, u.sta.auth_transaction, DEC);
-IEEE80211_IF_FILE(num_beacons_sta, u.sta.num_beacons, DEC);
 
 static ssize_t ieee80211_if_fmt_flags(
 	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -192,7 +191,6 @@ __IEEE80211_IF_FILE(flags);
 /* AP attributes */
 IEEE80211_IF_FILE(num_sta_ps, u.ap.num_sta_ps, ATOMIC);
 IEEE80211_IF_FILE(dtim_count, u.ap.dtim_count, DEC);
-IEEE80211_IF_FILE(num_beacons, u.ap.num_beacons, DEC);
 
 static ssize_t ieee80211_if_fmt_num_buffered_multicast(
 	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -265,7 +263,6 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata)
 	DEBUGFS_ADD(auth_alg, sta);
 	DEBUGFS_ADD(auth_transaction, sta);
 	DEBUGFS_ADD(flags, sta);
-	DEBUGFS_ADD(num_beacons_sta, sta);
 }
 
 static void add_ap_files(struct ieee80211_sub_if_data *sdata)
@@ -276,7 +273,6 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
 
 	DEBUGFS_ADD(num_sta_ps, ap);
 	DEBUGFS_ADD(dtim_count, ap);
-	DEBUGFS_ADD(num_beacons, ap);
 	DEBUGFS_ADD(num_buffered_multicast, ap);
 }
 
@@ -398,7 +394,6 @@ static void del_sta_files(struct ieee80211_sub_if_data *sdata)
 	DEBUGFS_DEL(auth_alg, sta);
 	DEBUGFS_DEL(auth_transaction, sta);
 	DEBUGFS_DEL(flags, sta);
-	DEBUGFS_DEL(num_beacons_sta, sta);
 }
 
 static void del_ap_files(struct ieee80211_sub_if_data *sdata)
@@ -409,7 +404,6 @@ static void del_ap_files(struct ieee80211_sub_if_data *sdata)
 
 	DEBUGFS_DEL(num_sta_ps, ap);
 	DEBUGFS_DEL(dtim_count, ap);
-	DEBUGFS_DEL(num_beacons, ap);
 	DEBUGFS_DEL(num_buffered_multicast, ap);
 }
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6bd6a6306da..3912fba6d3d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -229,7 +229,6 @@ struct ieee80211_if_ap {
 	struct sk_buff_head ps_bc_buf;
 	atomic_t num_sta_ps; /* number of stations in PS mode */
 	int dtim_count;
-	int num_beacons; /* number of TXed beacon frames for this BSS */
 };
 
 struct ieee80211_if_wds {
@@ -352,7 +351,6 @@ struct ieee80211_if_sta {
 	u32 supp_rates_bits[IEEE80211_NUM_BANDS];
 
 	int wmm_last_param_set;
-	int num_beacons; /* number of TXed beacon frames by this STA */
 };
 
 struct ieee80211_if_mesh {
@@ -388,7 +386,6 @@ struct ieee80211_if_mesh {
 	struct mesh_config mshcfg;
 	u32 mesh_seqnum;
 	bool accepting_plinks;
-	int num_beacons;
 };
 
 #ifdef CONFIG_MAC80211_MESH
@@ -484,7 +481,6 @@ struct ieee80211_sub_if_data {
 			struct dentry *auth_alg;
 			struct dentry *auth_transaction;
 			struct dentry *flags;
-			struct dentry *num_beacons_sta;
 			struct dentry *force_unicast_rateidx;
 			struct dentry *max_ratectrl_rateidx;
 		} sta;
@@ -492,7 +488,6 @@ struct ieee80211_sub_if_data {
 			struct dentry *drop_unencrypted;
 			struct dentry *num_sta_ps;
 			struct dentry *dtim_count;
-			struct dentry *num_beacons;
 			struct dentry *force_unicast_rateidx;
 			struct dentry *max_ratectrl_rateidx;
 			struct dentry *num_buffered_multicast;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index c12f361d718..d136a371e6b 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1864,7 +1864,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	struct rate_selection rsel;
 	struct beacon_data *beacon;
 	struct ieee80211_supported_band *sband;
-	int *num_beacons;
 	enum ieee80211_band band = local->hw.conf.channel->band;
 
 	sband = local->hw.wiphy->bands[band];
@@ -1912,8 +1911,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 			if (beacon->tail)
 				memcpy(skb_put(skb, beacon->tail_len),
 				       beacon->tail, beacon->tail_len);
-
-			num_beacons = &ap->num_beacons;
 		} else
 			goto out;
 	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
@@ -1931,8 +1928,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 		hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 						 IEEE80211_STYPE_BEACON);
 
-		num_beacons = &ifsta->num_beacons;
-#ifdef CONFIG_MAC80211_MESH
 	} else if (ieee80211_vif_is_mesh(&sdata->vif)) {
 		struct ieee80211_mgmt *mgmt;
 		u8 *pos;
@@ -1960,9 +1955,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 		*pos++ = 0x0;
 
 		mesh_mgmt_ies_add(skb, sdata);
-
-		num_beacons = &sdata->u.mesh.num_beacons;
-#endif
 	} else {
 		WARN_ON(1);
 		goto out;
@@ -1999,7 +1991,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	info->antenna_sel_tx = local->hw.conf.antenna_sel_tx;
 	info->control.retry_limit = 1;
 
-	(*num_beacons)++;
 out:
 	rcu_read_unlock();
 	return skb;
-- 
cgit v1.2.3


From 538df283c185c477dbdafafa9652c33e9742de75 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 12 Sep 2008 15:22:53 +0200
Subject: mac80211: remove debug frame dumping

You can just pull up a monitor interface to get much more
detailed information, or, when debugging a driver, insert
dump code into the driver (which usually you will have to
do anyway to dump the driver-specific information). Hence
this option is useless.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/Kconfig | 13 -------------
 net/mac80211/tx.c    | 42 ------------------------------------------
 2 files changed, 55 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 80d693392b0..8427518e4f2 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -179,19 +179,6 @@ config MAC80211_VERBOSE_MPL_DEBUG
 
 	  Do not select this option.
 
-config MAC80211_LOWTX_FRAME_DUMP
-	bool "Debug frame dumping"
-	depends on MAC80211_DEBUG_MENU
-	---help---
-	  Selecting this option will cause the stack to
-	  print a message for each frame that is handed
-	  to the lowlevel driver for transmission. This
-	  message includes all MAC addresses and the
-	  frame control field.
-
-	  If unsure, say N and insert the debugging code
-	  you require into the driver you are debugging.
-
 config MAC80211_DEBUG_COUNTERS
 	bool "Extra statistics for TX/RX debugging"
 	depends on MAC80211_DEBUG_MENU
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index d136a371e6b..20d683641b4 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,43 +38,6 @@
 
 /* misc utils */
 
-#ifdef CONFIG_MAC80211_LOWTX_FRAME_DUMP
-static void ieee80211_dump_frame(const char *ifname, const char *title,
-				 const struct sk_buff *skb)
-{
-	const struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
-	unsigned int hdrlen;
-	DECLARE_MAC_BUF(mac);
-
-	printk(KERN_DEBUG "%s: %s (len=%d)", ifname, title, skb->len);
-	if (skb->len < 4) {
-		printk("\n");
-		return;
-	}
-
-	hdrlen = ieee80211_hdrlen(hdr->frame_control);
-	if (hdrlen > skb->len)
-		hdrlen = skb->len;
-	if (hdrlen >= 4)
-		printk(" FC=0x%04x DUR=0x%04x",
-		    le16_to_cpu(hdr->frame_control), le16_to_cpu(hdr->duration_id));
-	if (hdrlen >= 10)
-		printk(" A1=%s", print_mac(mac, hdr->addr1));
-	if (hdrlen >= 16)
-		printk(" A2=%s", print_mac(mac, hdr->addr2));
-	if (hdrlen >= 24)
-		printk(" A3=%s", print_mac(mac, hdr->addr3));
-	if (hdrlen >= 30)
-		printk(" A4=%s", print_mac(mac, hdr->addr4));
-	printk("\n");
-}
-#else /* CONFIG_MAC80211_LOWTX_FRAME_DUMP */
-static inline void ieee80211_dump_frame(const char *ifname, const char *title,
-					struct sk_buff *skb)
-{
-}
-#endif /* CONFIG_MAC80211_LOWTX_FRAME_DUMP */
-
 static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
 				 int next_frag_len)
 {
@@ -1068,8 +1031,6 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
 			return IEEE80211_TX_AGAIN;
 		info =  IEEE80211_SKB_CB(skb);
 
-		ieee80211_dump_frame(wiphy_name(local->hw.wiphy),
-				     "TX to low-level driver", skb);
 		ret = local->ops->tx(local_to_hw(local), skb);
 		if (ret)
 			return IEEE80211_TX_AGAIN;
@@ -1099,9 +1060,6 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
 						~IEEE80211_TX_CTL_RATE_CTRL_PROBE;
 			}
 
-			ieee80211_dump_frame(wiphy_name(local->hw.wiphy),
-					     "TX to low-level driver",
-					     tx->extra_frag[i]);
 			ret = local->ops->tx(local_to_hw(local),
 					    tx->extra_frag[i]);
 			if (ret)
-- 
cgit v1.2.3


From e16ce63c893ff7ccb314d2fbdafbbc915b64d173 Mon Sep 17 00:00:00 2001
From: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Date: Fri, 12 Sep 2008 13:44:08 -0700
Subject: mac80211 : Fix mode change hard_start_xmit function

When monitor mode is changed to BSS or IBSS, data trasnfer can not happen
because proper transmit function is not assigend for BSS ,IBSS mode.
This patch fixes this problem by assigning the ieee80211_subif_start_xmit
to device's hard_start_xmit function.

Signed-off-by: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Acked-by: Zhu Yi <yi.zhu@intel.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/iface.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index a7ef0289fbd..a72fbebb8ea 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -624,6 +624,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 
 	/* and set some type-dependent values */
 	sdata->vif.type = type;
+	sdata->dev->hard_start_xmit = ieee80211_subif_start_xmit;
 
 	/* only monitor differs */
 	sdata->dev->type = ARPHRD_ETHER;
-- 
cgit v1.2.3


From 9e691ed68d94ab3047e028736641445b4cf74d67 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Wed, 17 Sep 2008 10:10:41 +1000
Subject: ipvs: only unlock in ip_vs_edit_service() if already locked

Jumping to out unlocks __ip_vs_svc_lock, but that lock is not taken until
after code that may jump to out.

This problem was detected by sparse.

make C=1
  CHECK   net/ipv4/ipvs/ip_vs_ctl.c
net/ipv4/ipvs/ip_vs_ctl.c:1332:2: warning: context imbalance in 'ip_vs_edit_service' - unexpected unlock

Acked-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 993a83fb0d5..60ca24b9ec0 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1305,7 +1305,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 		 */
 		if ((ret = ip_vs_unbind_scheduler(svc))) {
 			old_sched = sched;
-			goto out;
+			goto out_unlock;
 		}
 
 		/*
@@ -1324,12 +1324,13 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 			 */
 			ip_vs_bind_scheduler(svc, old_sched);
 			old_sched = sched;
-			goto out;
+			goto out_unlock;
 		}
 	}
 
-  out:
+  out_unlock:
 	write_unlock_bh(&__ip_vs_svc_lock);
+  out:
 
 	if (old_sched)
 		ip_vs_scheduler_put(old_sched);
-- 
cgit v1.2.3


From dff630ddad3884b99fae3ad92f5eccbf26618679 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Wed, 17 Sep 2008 10:10:42 +1000
Subject: ipvs: supply a valid 0 address to ip_vs_conn_new()

ip_vs_conn_new expects a union nf_inet_addr as the type for its address
parameters, not a plain integer.

This problem was detected by sparse.

make C=1
  CHECK   net/ipv4/ipvs/ip_vs_core.c
net/ipv4/ipvs/ip_vs_core.c:469:9: warning: Using plain integer as NULL pointer

Acked-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 80a4fcf33a5..ece748dbd0c 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -457,6 +457,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
+		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
 		ip_vs_service_put(svc);
 
@@ -465,7 +466,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		cp = ip_vs_conn_new(svc->af, iph.protocol,
 				    &iph.saddr, pptr[0],
 				    &iph.daddr, pptr[1],
-				    0, 0,
+				    &daddr, 0,
 				    IP_VS_CONN_F_BYPASS,
 				    NULL);
 		if (cp == NULL)
-- 
cgit v1.2.3


From 563e94f072714657a82a59a3bf81a719a6a25591 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Wed, 17 Sep 2008 10:10:42 +1000
Subject: ipvs: add __aquire/__release annotations to
 ip_vs_info_seq_start/ip_vs_info_seq_stop

This teaches sparse that the following are not problems:

make C=1
  CHECK   net/ipv4/ipvs/ip_vs_ctl.c
net/ipv4/ipvs/ip_vs_ctl.c:1793:14: warning: context imbalance in 'ip_vs_info_seq_start' - wrong count at exit
net/ipv4/ipvs/ip_vs_ctl.c:1842:13: warning: context imbalance in 'ip_vs_info_seq_stop' - unexpected unlock

Acked-by: Sven Wegener <sven.wegener@stealer.net>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 60ca24b9ec0..771551d8fba 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1787,6 +1787,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 }
 
 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+__acquires(__ip_vs_svc_lock)
 {
 
 	read_lock_bh(&__ip_vs_svc_lock);
@@ -1840,6 +1841,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+__releases(__ip_vs_svc_lock)
 {
 	read_unlock_bh(&__ip_vs_svc_lock);
 }
-- 
cgit v1.2.3


From d286600e199aa2f1058a1f883d234e73626304d2 Mon Sep 17 00:00:00 2001
From: Brian Haley <brian.haley@hp.com>
Date: Tue, 16 Sep 2008 11:11:11 -0400
Subject: ipvs: change some __constant_htons() to htons()

Change __contant_htons() to htons() in the IPVS code when not in an
initializer.

-Brian

Signed-off-by: Brian Haley <brian.haley@hp.com>
Acked-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_proto.c        | 2 +-
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index b06da1c3445..0791f9e08fe 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -237,7 +237,7 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
 			  const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == __constant_htons(ETH_P_IPV6))
+	if (skb->protocol == htons(ETH_P_IPV6))
 		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
 	else
 #endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
index 2b18a78d039..80ab0c8e5b4 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -167,7 +167,7 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 		    int offset, const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == __constant_htons(ETH_P_IPV6))
+	if (skb->protocol == htons(ETH_P_IPV6))
 		ah_esp_debug_packet_v6(pp, skb, offset, msg);
 	else
 #endif
-- 
cgit v1.2.3


From 821c92f258bd9b01eb900992969803645b6ba9d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Thu, 18 Sep 2008 16:44:31 -0700
Subject: ISDN sockets: add missing lockdep strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 91f8bbc9352..23b8b9da36b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -154,7 +154,7 @@ static const char *af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
-  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_MAX"
 };
 static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -168,7 +168,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
-  "slock-AF_RXRPC" , "slock-AF_MAX"
+  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_MAX"
 };
 static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -182,7 +182,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
-  "clock-AF_RXRPC" , "clock-AF_MAX"
+  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_MAX"
 };
 #endif
 
-- 
cgit v1.2.3


From 64edc2736e23994e0334b70c5ff08dc33e2ebbd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:18:32 -0700
Subject: tcp: Partial hint clearing has again become meaningless
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ie., the difference between partial and all clearing doesn't
exists anymore since the SACK optimizations got dropped by
an sacktag rewrite.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 5 ++---
 net/ipv4/tcp_output.c | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f79a5160729..7306bfb16cd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1883,7 +1883,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 
-	tcp_clear_retrans_hints_partial(tp);
+	tcp_clear_all_retrans_hints(tp);
 }
 
 static void tcp_clear_retrans_partial(struct tcp_sock *tp)
@@ -1934,12 +1934,11 @@ void tcp_enter_loss(struct sock *sk, int how)
 		/* Push undo marker, if it was plain RTO and nothing
 		 * was retransmitted. */
 		tp->undo_marker = tp->snd_una;
-		tcp_clear_retrans_hints_partial(tp);
 	} else {
 		tp->sacked_out = 0;
 		tp->fackets_out = 0;
-		tcp_clear_all_retrans_hints(tp);
 	}
+	tcp_clear_all_retrans_hints(tp);
 
 	tcp_for_write_queue(skb, sk) {
 		if (skb == tcp_send_head(sk))
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8165f5aa8c7..11490958a09 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -750,7 +750,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	BUG_ON(len > skb->len);
 
-	tcp_clear_retrans_hints_partial(tp);
+	tcp_clear_all_retrans_hints(tp);
 	nsize = skb_headlen(skb) - len;
 	if (nsize < 0)
 		nsize = 0;
@@ -1823,7 +1823,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
 	tp->packets_out -= tcp_skb_pcount(next_skb);
 
 	/* changed transmit queue under us so clear hints */
-	tcp_clear_retrans_hints_partial(tp);
+	tcp_clear_all_retrans_hints(tp);
 
 	sk_wmem_free_skb(sk, next_skb);
 }
-- 
cgit v1.2.3


From c8c213f20ce97c66fe2ff86f33814d1ca0f9d7ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:18:55 -0700
Subject: tcp: move tcp_verify_retransmit_hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7306bfb16cd..9e95ad637db 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -979,6 +979,19 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 	}
 }
 
+/* RFC: This is from the original, I doubt that this is necessary at all:
+ * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
+ * retransmitted past LOST markings in the first place? I'm not fully sure
+ * about undo and end of connection cases, which can cause R without L?
+ */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if ((tp->retransmit_skb_hint != NULL) &&
+	    before(TCP_SKB_CB(skb)->seq,
+		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+		tp->retransmit_skb_hint = NULL;
+}
+
 /* This procedure tags the retransmission queue when SACKs arrive.
  *
  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -2156,19 +2169,6 @@ static int tcp_time_to_recover(struct sock *sk)
 	return 0;
 }
 
-/* RFC: This is from the original, I doubt that this is necessary at all:
- * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
- * retransmitted past LOST markings in the first place? I'm not fully sure
- * about undo and end of connection cases, which can cause R without L?
- */
-static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
-{
-	if ((tp->retransmit_skb_hint != NULL) &&
-	    before(TCP_SKB_CB(skb)->seq,
-		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
-		tp->retransmit_skb_hint = NULL;
-}
-
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-- 
cgit v1.2.3


From 41ea36e35a0daa75377b3e70680e5c3a3f83fe27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:19:22 -0700
Subject: tcp: add helper for lost bit toggling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This useful because we'd need to verifying soon in many places
which makes things slightly more complex than it used to be.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9e95ad637db..12512336dbd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -992,6 +992,16 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 		tp->retransmit_skb_hint = NULL;
 }
 
+static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+		tcp_verify_retransmit_hint(tp, skb);
+
+		tp->lost_out += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+	}
+}
+
 /* This procedure tags the retransmission queue when SACKs arrive.
  *
  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -2216,11 +2226,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 			cnt = packets;
 		}
 
-		if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-			tcp_verify_retransmit_hint(tp, skb);
-		}
+		tcp_skb_mark_lost(tp, skb);
 	}
 	tcp_verify_left_out(tp);
 }
@@ -2262,11 +2268,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 			if (!tcp_skb_timedout(sk, skb))
 				break;
 
-			if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
-				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-				tp->lost_out += tcp_skb_pcount(skb);
-				tcp_verify_retransmit_hint(tp, skb);
-			}
+			tcp_skb_mark_lost(tp, skb);
 		}
 
 		tp->scoreboard_skb_hint = skb;
-- 
cgit v1.2.3


From 006f582c73f4eda35e06fd323193c3df43fb3459 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:20:20 -0700
Subject: tcp: convert retransmit_cnt_hint to seqno
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Main benefit in this is that we can then freely point
the retransmit_skb_hint to anywhere we want to because
there's no longer need to know what would be the count
changes involve, and since this is really used only as a
terminator, unnecessary work is one time walk at most,
and if some retransmissions are necessary after that
point later on, the walk is not full waste of time
anyway.

Since retransmit_high must be kept valid, all lost
markers must ensure that.

Now I also have learned how those "holes" in the
rexmittable skbs can appear, mtu probe does them. So
I removed the misleading comment as well.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 34 ++++++++++++++++++++--------------
 net/ipv4/tcp_output.c | 25 +++++++------------------
 2 files changed, 27 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 12512336dbd..d271cc82500 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -979,17 +979,17 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 	}
 }
 
-/* RFC: This is from the original, I doubt that this is necessary at all:
- * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
- * retransmitted past LOST markings in the first place? I'm not fully sure
- * about undo and end of connection cases, which can cause R without L?
- */
+/* This must be called before lost_out is incremented */
 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	if ((tp->retransmit_skb_hint != NULL) &&
+	if ((tp->retransmit_skb_hint == NULL) ||
 	    before(TCP_SKB_CB(skb)->seq,
 		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
-		tp->retransmit_skb_hint = NULL;
+		tp->retransmit_skb_hint = skb;
+
+	if (!tp->lost_out ||
+	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
+		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
@@ -1002,6 +1002,16 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	tcp_verify_retransmit_hint(tp, skb);
+
+	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+		tp->lost_out += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+	}
+}
+
 /* This procedure tags the retransmission queue when SACKs arrive.
  *
  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -1178,13 +1188,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
 
-			/* clear lost hint */
-			tp->retransmit_skb_hint = NULL;
-
-			if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
-				tp->lost_out += tcp_skb_pcount(skb);
-				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			}
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
 		} else {
 			if (before(ack_seq, new_low_seq))
@@ -1890,6 +1894,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
+			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 		}
 	}
 	tcp_verify_left_out(tp);
@@ -1974,6 +1979,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
+			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 		}
 	}
 	tcp_verify_left_out(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 11490958a09..cfae61b40c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1838,7 +1838,7 @@ void tcp_simple_retransmit(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int mss = tcp_current_mss(sk, 0);
-	int lost = 0;
+	u32 prior_lost = tp->lost_out;
 
 	tcp_for_write_queue(skb, sk) {
 		if (skb == tcp_send_head(sk))
@@ -1849,17 +1849,13 @@ void tcp_simple_retransmit(struct sock *sk)
 				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 				tp->retrans_out -= tcp_skb_pcount(skb);
 			}
-			if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
-				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-				tp->lost_out += tcp_skb_pcount(skb);
-				lost = 1;
-			}
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
 		}
 	}
 
 	tcp_clear_all_retrans_hints(tp);
 
-	if (!lost)
+	if (prior_lost == tp->lost_out)
 		return;
 
 	if (tcp_is_reno(tp))
@@ -2009,15 +2005,11 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int packet_cnt;
 
-	if (tp->retransmit_skb_hint) {
+	if (tp->retransmit_skb_hint)
 		skb = tp->retransmit_skb_hint;
-		packet_cnt = tp->retransmit_cnt_hint;
-	} else {
+	else
 		skb = tcp_write_queue_head(sk);
-		packet_cnt = 0;
-	}
 
 	/* First pass: retransmit lost packets. */
 	if (tp->lost_out) {
@@ -2028,7 +2020,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 				break;
 			/* we could do better than to assign each time */
 			tp->retransmit_skb_hint = skb;
-			tp->retransmit_cnt_hint = packet_cnt;
 
 			/* Assume this retransmit will generate
 			 * only one packet for congestion window
@@ -2039,6 +2030,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 			 */
 			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
 				return;
+			if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high))
+				break;
 
 			if (sacked & TCPCB_LOST) {
 				if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
@@ -2059,10 +2052,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 									  inet_csk(sk)->icsk_rto,
 									  TCP_RTO_MAX);
 				}
-
-				packet_cnt += tcp_skb_pcount(skb);
-				if (packet_cnt >= tp->lost_out)
-					break;
 			}
 		}
 	}
-- 
cgit v1.2.3


From f09142eddb75005e41b0af3e5214979d8b534b1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:20:50 -0700
Subject: tcp: Kill precaution that's very likely obsolete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I suspect it might have been related to the changed amount
of lost skbs, which was counted by retransmit_cnt_hint that
got changed.

The place for this clearing was very illogical anyway,
it should have been after the LOST-bit clearing loop to
make any sense.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d271cc82500..28e93f1e421 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2385,10 +2385,6 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
 	}
 	tcp_moderate_cwnd(tp);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-
-	/* There is something screwy going on with the retrans hints after
-	   an undo */
-	tcp_clear_all_retrans_hints(tp);
 }
 
 static inline int tcp_may_undo(struct tcp_sock *tp)
-- 
cgit v1.2.3


From 184d68b2b0b836587f92887b14baea41033ffeef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:21:16 -0700
Subject: tcp: No need to clear retransmit_skb_hint when SACKing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because lost counter no longer requires tuning, this is
trivial to remove (the tuning wouldn't have been too
hard either) because no "new" retransmittable skb appeared
below retransmit_skb_hint when SACKing for sure.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28e93f1e421..d017aed6edd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1298,9 +1298,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 					~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
 				tp->lost_out -= tcp_skb_pcount(skb);
 				tp->retrans_out -= tcp_skb_pcount(skb);
-
-				/* clear lost hint */
-				tp->retransmit_skb_hint = NULL;
 			}
 		} else {
 			if (!(sacked & TCPCB_RETRANS)) {
@@ -1319,9 +1316,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 			if (sacked & TCPCB_LOST) {
 				TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 				tp->lost_out -= tcp_skb_pcount(skb);
-
-				/* clear lost hint */
-				tp->retransmit_skb_hint = NULL;
 			}
 		}
 
@@ -1351,7 +1345,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 	if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
 		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 		tp->retrans_out -= tcp_skb_pcount(skb);
-		tp->retransmit_skb_hint = NULL;
 	}
 
 	return flag;
-- 
cgit v1.2.3


From b5afe7bc71a1689376c9b547376d17568469f3b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:21:54 -0700
Subject: tcp: add tcp_can_forward_retransmit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cfae61b40c4..957c4e3d217 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1992,6 +1992,33 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	return err;
 }
 
+static int tcp_can_forward_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Forward retransmissions are possible only during Recovery. */
+	if (icsk->icsk_ca_state != TCP_CA_Recovery)
+		return 0;
+
+	/* No forward retransmissions in Reno are possible. */
+	if (tcp_is_reno(tp))
+		return 0;
+
+	/* Yeah, we have to make difficult choice between forward transmission
+	 * and retransmission... Both ways have their merits...
+	 *
+	 * For now we do not retransmit anything, while we have some new
+	 * segments to send. In the other cases, follow rule 3 for
+	 * NextSeg() specified in RFC3517.
+	 */
+
+	if (tcp_may_send_now(sk))
+		return 0;
+
+	return 1;
+}
+
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2057,24 +2084,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	}
 
 	/* OK, demanded retransmission is finished. */
-
-	/* Forward retransmissions are possible only during Recovery. */
-	if (icsk->icsk_ca_state != TCP_CA_Recovery)
-		return;
-
-	/* No forward retransmissions in Reno are possible. */
-	if (tcp_is_reno(tp))
-		return;
-
-	/* Yeah, we have to make difficult choice between forward transmission
-	 * and retransmission... Both ways have their merits...
-	 *
-	 * For now we do not retransmit anything, while we have some new
-	 * segments to send. In the other cases, follow rule 3 for
-	 * NextSeg() specified in RFC3517.
-	 */
-
-	if (tcp_may_send_now(sk))
+	if (!tcp_can_forward_retransmit(sk))
 		return;
 
 	/* If nothing is SACKed, highest_sack in the loop won't be valid */
-- 
cgit v1.2.3


From 34638570b58290e8cb875fb24dcbe836ffeb6cb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:22:17 -0700
Subject: tcp: remove obsolete validity concern
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 957c4e3d217..6f2a3f4a1af 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2087,10 +2087,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	if (!tcp_can_forward_retransmit(sk))
 		return;
 
-	/* If nothing is SACKed, highest_sack in the loop won't be valid */
-	if (!tp->sacked_out)
-		return;
-
 	if (tp->forward_skb_hint)
 		skb = tp->forward_skb_hint;
 	else
-- 
cgit v1.2.3


From 61eb55f4db7eaf5fb2d5ec12981a8cda755bb0e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:22:59 -0700
Subject: tcp: Reorganize skb tagbit checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6f2a3f4a1af..2f24ecc3706 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2032,6 +2032,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
+	int mib_idx;
 
 	if (tp->retransmit_skb_hint)
 		skb = tp->retransmit_skb_hint;
@@ -2059,27 +2060,26 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 				return;
 			if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high))
 				break;
+			if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+				continue;
 
-			if (sacked & TCPCB_LOST) {
-				if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
-					int mib_idx;
-
-					if (tcp_retransmit_skb(sk, skb)) {
-						tp->retransmit_skb_hint = NULL;
-						return;
-					}
-					if (icsk->icsk_ca_state != TCP_CA_Loss)
-						mib_idx = LINUX_MIB_TCPFASTRETRANS;
-					else
-						mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
-					NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
-					if (skb == tcp_write_queue_head(sk))
-						inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-									  inet_csk(sk)->icsk_rto,
-									  TCP_RTO_MAX);
-				}
+			if (!(sacked & TCPCB_LOST))
+				continue;
+
+			if (tcp_retransmit_skb(sk, skb)) {
+				tp->retransmit_skb_hint = NULL;
+				return;
 			}
+			if (icsk->icsk_ca_state != TCP_CA_Loss)
+				mib_idx = LINUX_MIB_TCPFASTRETRANS;
+			else
+				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+			NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+			if (skb == tcp_write_queue_head(sk))
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+							  inet_csk(sk)->icsk_rto,
+							  TCP_RTO_MAX);
 		}
 	}
 
-- 
cgit v1.2.3


From 08ebd1721ab8fd362e90ae17b461c07b23fa2824 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:23:49 -0700
Subject: tcp: remove tp->lost_out guard to make joining diff nicer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The validity of the retransmit_high must then be ensured
if no L'ed skb exits!

This makes a minor change to behavior, we now have to
iterate the head to find out that the loop terminates.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 75 ++++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2f24ecc3706..9f44be633ef 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2034,53 +2034,54 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct sk_buff *skb;
 	int mib_idx;
 
+	if (!tp->lost_out)
+		tp->retransmit_high = tp->snd_una;
+
 	if (tp->retransmit_skb_hint)
 		skb = tp->retransmit_skb_hint;
 	else
 		skb = tcp_write_queue_head(sk);
 
 	/* First pass: retransmit lost packets. */
-	if (tp->lost_out) {
-		tcp_for_write_queue_from(skb, sk) {
-			__u8 sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_for_write_queue_from(skb, sk) {
+		__u8 sacked = TCP_SKB_CB(skb)->sacked;
 
-			if (skb == tcp_send_head(sk))
-				break;
-			/* we could do better than to assign each time */
-			tp->retransmit_skb_hint = skb;
-
-			/* Assume this retransmit will generate
-			 * only one packet for congestion window
-			 * calculation purposes.  This works because
-			 * tcp_retransmit_skb() will chop up the
-			 * packet to be MSS sized and all the
-			 * packet counting works out.
-			 */
-			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
-				return;
-			if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high))
-				break;
-			if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
-				continue;
+		if (skb == tcp_send_head(sk))
+			break;
+		/* we could do better than to assign each time */
+		tp->retransmit_skb_hint = skb;
+
+		/* Assume this retransmit will generate
+		 * only one packet for congestion window
+		 * calculation purposes.  This works because
+		 * tcp_retransmit_skb() will chop up the
+		 * packet to be MSS sized and all the
+		 * packet counting works out.
+		 */
+		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+			return;
+		if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high))
+			break;
+		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+			continue;
 
-			if (!(sacked & TCPCB_LOST))
-				continue;
+		if (!(sacked & TCPCB_LOST))
+			continue;
 
-			if (tcp_retransmit_skb(sk, skb)) {
-				tp->retransmit_skb_hint = NULL;
-				return;
-			}
-			if (icsk->icsk_ca_state != TCP_CA_Loss)
-				mib_idx = LINUX_MIB_TCPFASTRETRANS;
-			else
-				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
-			NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
-			if (skb == tcp_write_queue_head(sk))
-				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-							  inet_csk(sk)->icsk_rto,
-							  TCP_RTO_MAX);
+		if (tcp_retransmit_skb(sk, skb)) {
+			tp->retransmit_skb_hint = NULL;
+			return;
 		}
+		if (icsk->icsk_ca_state != TCP_CA_Loss)
+			mib_idx = LINUX_MIB_TCPFASTRETRANS;
+		else
+			mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+		if (skb == tcp_write_queue_head(sk))
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  inet_csk(sk)->icsk_rto,
+						  TCP_RTO_MAX);
 	}
 
 	/* OK, demanded retransmission is finished. */
-- 
cgit v1.2.3


From 0e1c54c2a405494281e0639aacc90db03b50ae77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:24:21 -0700
Subject: tcp: reorganize retransmit code loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both loops are quite similar, so they can be combined
with little effort. As a result, forward_skb_hint becomes
obsolete as well.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 79 +++++++++++++++++++++------------------------------
 1 file changed, 33 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f44be633ef..b5b4ddcdda4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2032,7 +2032,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
+	struct sk_buff *hole = NULL;
 	int mib_idx;
+	int fwd_rexmitting = 0;
 
 	if (!tp->lost_out)
 		tp->retransmit_high = tp->snd_una;
@@ -2049,7 +2051,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (skb == tcp_send_head(sk))
 			break;
 		/* we could do better than to assign each time */
-		tp->retransmit_skb_hint = skb;
+		if (hole == NULL)
+			tp->retransmit_skb_hint = skb;
 
 		/* Assume this retransmit will generate
 		 * only one packet for congestion window
@@ -2060,65 +2063,49 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		 */
 		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
 			return;
-		if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high))
-			break;
-		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
-			continue;
-
-		if (!(sacked & TCPCB_LOST))
-			continue;
-
-		if (tcp_retransmit_skb(sk, skb)) {
-			tp->retransmit_skb_hint = NULL;
-			return;
-		}
-		if (icsk->icsk_ca_state != TCP_CA_Loss)
-			mib_idx = LINUX_MIB_TCPFASTRETRANS;
-		else
-			mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
-		NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
-		if (skb == tcp_write_queue_head(sk))
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  inet_csk(sk)->icsk_rto,
-						  TCP_RTO_MAX);
-	}
-
-	/* OK, demanded retransmission is finished. */
-	if (!tcp_can_forward_retransmit(sk))
-		return;
 
-	if (tp->forward_skb_hint)
-		skb = tp->forward_skb_hint;
-	else
-		skb = tcp_write_queue_head(sk);
+		if (fwd_rexmitting) {
+begin_fwd:
+			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+				break;
+			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
 
-	tcp_for_write_queue_from(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-		tp->forward_skb_hint = skb;
+		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+			if (!tcp_can_forward_retransmit(sk))
+				break;
+			/* Backtrack if necessary to non-L'ed skb */
+			if (hole != NULL) {
+				skb = hole;
+				hole = NULL;
+			}
+			fwd_rexmitting = 1;
+			goto begin_fwd;
 
-		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-			break;
+		} else if (!(sacked & TCPCB_LOST)) {
+			if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
+				hole = skb;
+			continue;
 
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
-			break;
+		} else {
+			if (icsk->icsk_ca_state != TCP_CA_Loss)
+				mib_idx = LINUX_MIB_TCPFASTRETRANS;
+			else
+				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+		}
 
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
+		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		/* Ok, retransmit it. */
 		if (tcp_retransmit_skb(sk, skb)) {
-			tp->forward_skb_hint = NULL;
-			break;
+			tp->retransmit_skb_hint = NULL;
+			return;
 		}
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
 		if (skb == tcp_write_queue_head(sk))
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,
 						  TCP_RTO_MAX);
-
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS);
 	}
 }
 
-- 
cgit v1.2.3


From f0ceb0ed86b4792a4ed9d3438f5f7572e48f9803 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:24:49 -0700
Subject: tcp: remove retransmit_skb_hint clearing from failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This doesn't much sense here afaict, probably never has. Since
fragmenting and collapsing deal the hints by themselves, there
should be very little reason for the rexmit loop to do that.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b5b4ddcdda4..f900fae8b87 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2096,10 +2096,8 @@ begin_fwd:
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb)) {
-			tp->retransmit_skb_hint = NULL;
+		if (tcp_retransmit_skb(sk, skb))
 			return;
-		}
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
 		if (skb == tcp_write_queue_head(sk))
-- 
cgit v1.2.3


From ef9da47c7cc64d69526331f315e76b5680d4048f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:25:15 -0700
Subject: tcp: don't clear retransmit_skb_hint when not necessary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most importantly avoid doing it with cumulative ACK. Not clearing
means that we no longer need n^2 processing in resolution of each
fast recovery.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 4 +++-
 net/ipv4/tcp_output.c | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d017aed6edd..44a4fffc2cc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2925,7 +2925,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
 
 		tcp_unlink_write_queue(skb, sk);
 		sk_wmem_free_skb(sk, skb);
-		tcp_clear_all_retrans_hints(tp);
+		tcp_clear_retrans_hints_partial(tp);
+		if (skb == tp->retransmit_skb_hint)
+			tp->retransmit_skb_hint = NULL;
 	}
 
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f900fae8b87..239cea7b6c0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -750,7 +750,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	BUG_ON(len > skb->len);
 
-	tcp_clear_all_retrans_hints(tp);
+	tcp_clear_retrans_hints_partial(tp);
 	nsize = skb_headlen(skb) - len;
 	if (nsize < 0)
 		nsize = 0;
@@ -1823,7 +1823,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
 	tp->packets_out -= tcp_skb_pcount(next_skb);
 
 	/* changed transmit queue under us so clear hints */
-	tcp_clear_all_retrans_hints(tp);
+	tcp_clear_retrans_hints_partial(tp);
+	if (next_skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = skb;
 
 	sk_wmem_free_skb(sk, next_skb);
 }
@@ -1853,7 +1855,7 @@ void tcp_simple_retransmit(struct sock *sk)
 		}
 	}
 
-	tcp_clear_all_retrans_hints(tp);
+	tcp_clear_retrans_hints_partial(tp);
 
 	if (prior_lost == tp->lost_out)
 		return;
-- 
cgit v1.2.3


From 90638a04ad8484b6b6c567656fb3f6d0689e23da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:25:52 -0700
Subject: tcp: don't clear lost_skb_hint when not necessary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most importantly avoid doing it with cumulative ACK. However,
since we have lost_cnt_hint in the picture as well needing
adjustments, it's not as trivial as dealing with
retransmit_skb_hint (and cannot be done in the all place we
could trivially leave retransmit_skb_hint untouched).

With the previous patch, this should mostly remove O(n^2)
behavior while cumulative ACKs start flowing once rexmit
after a lossy round-trip made it through.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 44a4fffc2cc..85627f83665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2844,6 +2844,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
 	int flag = 0;
 	u32 pkts_acked = 0;
 	u32 reord = tp->packets_out;
+	u32 prior_sacked = tp->sacked_out;
 	s32 seq_rtt = -1;
 	s32 ca_seq_rtt = -1;
 	ktime_t last_ackt = net_invalid_timestamp();
@@ -2925,9 +2926,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
 
 		tcp_unlink_write_queue(skb, sk);
 		sk_wmem_free_skb(sk, skb);
-		tcp_clear_retrans_hints_partial(tp);
+		tp->scoreboard_skb_hint = NULL;
 		if (skb == tp->retransmit_skb_hint)
 			tp->retransmit_skb_hint = NULL;
+		if (skb == tp->lost_skb_hint)
+			tp->lost_skb_hint = NULL;
 	}
 
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
@@ -2946,6 +2949,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
 			/* Non-retransmitted hole got filled? That's reordering */
 			if (reord < prior_fackets)
 				tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+
+			/* No need to care for underflows here because
+			 * the lost_skb_hint gets NULLed if we're past it
+			 * (or something non-trivial happened)
+			 */
+			if (tcp_is_fack(tp))
+				tp->lost_cnt_hint -= pkts_acked;
+			else
+				tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
 		}
 
 		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
-- 
cgit v1.2.3


From 618d9f25548ba6fc3a9cd2ce5cd56f4f015b0635 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sat, 20 Sep 2008 21:26:22 -0700
Subject: tcp: back retransmit_high when it over-estimated
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If lost skb is sacked, we might have nothing to retransmit
as high as the retransmit_high is pointing to, so place
it lower to avoid unnecessary walking.

This is mainly for the case where high L'ed skbs gets sacked.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 239cea7b6c0..8f9793a37b6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2035,16 +2035,22 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	struct sk_buff *hole = NULL;
+	u32 last_lost;
 	int mib_idx;
 	int fwd_rexmitting = 0;
 
 	if (!tp->lost_out)
 		tp->retransmit_high = tp->snd_una;
 
-	if (tp->retransmit_skb_hint)
+	if (tp->retransmit_skb_hint) {
 		skb = tp->retransmit_skb_hint;
-	else
+		last_lost = TCP_SKB_CB(skb)->end_seq;
+		if (after(last_lost, tp->retransmit_high))
+			last_lost = tp->retransmit_high;
+	} else {
 		skb = tcp_write_queue_head(sk);
+		last_lost = tp->snd_una;
+	}
 
 	/* First pass: retransmit lost packets. */
 	tcp_for_write_queue_from(skb, sk) {
@@ -2073,6 +2079,7 @@ begin_fwd:
 			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
 
 		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+			tp->retransmit_high = last_lost;
 			if (!tcp_can_forward_retransmit(sk))
 				break;
 			/* Backtrack if necessary to non-L'ed skb */
@@ -2089,6 +2096,7 @@ begin_fwd:
 			continue;
 
 		} else {
+			last_lost = TCP_SKB_CB(skb)->end_seq;
 			if (icsk->icsk_ca_state != TCP_CA_Loss)
 				mib_idx = LINUX_MIB_TCPFASTRETRANS;
 			else
-- 
cgit v1.2.3


From a574420ff46cff0245e6b0fce2e961aa2717743d Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Sat, 20 Sep 2008 22:07:34 -0700
Subject: multiq: requeue should rewind the current_band

Currently dequeueing a packet and requeueing the same packet will cause a
different packet to be pulled on the next dequeue.  This change forces
requeue to rewind the current_band.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_multiq.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 5d9cd68e91d..915f3149dde 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -97,6 +97,7 @@ static int
 multiq_requeue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct Qdisc *qdisc;
+	struct multiq_sched_data *q = qdisc_priv(sch);
 	int ret;
 
 	qdisc = multiq_classify(skb, sch, &ret);
@@ -113,6 +114,10 @@ multiq_requeue(struct sk_buff *skb, struct Qdisc *sch)
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
 		sch->qstats.requeues++;
+		if (q->curband)
+			q->curband--;
+		else
+			q->curband = q->bands - 1;
 		return NET_XMIT_SUCCESS;
 	}
 	if (net_xmit_drop_count(ret))
-- 
cgit v1.2.3


From 6067804047b64dde89f4f133fc7eba48ee44107d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 20 Sep 2008 22:20:49 -0700
Subject: net: Use hton[sl]() instead of __constant_hton[sl]() where applicable

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c           |  2 +-
 net/atm/br2684.c               |  8 ++++----
 net/core/dev.c                 |  4 ++--
 net/ethernet/eth.c             |  2 +-
 net/ipv4/ipvs/ip_vs_core.c     |  4 ++--
 net/ipv6/ip6_tunnel.c          |  4 ++--
 net/mac80211/wme.c             |  2 +-
 net/sched/cls_flow.c           | 28 ++++++++++++++--------------
 net/sched/sch_dsmark.c         |  8 ++++----
 net/sched/sch_sfq.c            |  4 ++--
 net/sunrpc/xprtrdma/rpc_rdma.c |  4 ++--
 11 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 97688cdb550..8883e9c8a22 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -48,7 +48,7 @@ static int vlan_dev_rebuild_header(struct sk_buff *skb)
 
 	switch (veth->h_vlan_encapsulated_proto) {
 #ifdef CONFIG_INET
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 
 		/* TODO:  Confirm this will work with VLAN headers... */
 		return arp_find(veth->h_dest, skb);
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 8d9a6f15888..280de481edc 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -375,11 +375,11 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
 			if (memcmp
 			    (skb->data + 6, ethertype_ipv6,
 			     sizeof(ethertype_ipv6)) == 0)
-				skb->protocol = __constant_htons(ETH_P_IPV6);
+				skb->protocol = htons(ETH_P_IPV6);
 			else if (memcmp
 				 (skb->data + 6, ethertype_ipv4,
 				  sizeof(ethertype_ipv4)) == 0)
-				skb->protocol = __constant_htons(ETH_P_IP);
+				skb->protocol = htons(ETH_P_IP);
 			else
 				goto error;
 			skb_pull(skb, sizeof(llc_oui_ipv4));
@@ -404,9 +404,9 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
 			skb_reset_network_header(skb);
 			iph = ip_hdr(skb);
 			if (iph->version == 4)
-				skb->protocol = __constant_htons(ETH_P_IP);
+				skb->protocol = htons(ETH_P_IP);
 			else if (iph->version == 6)
-				skb->protocol = __constant_htons(ETH_P_IPV6);
+				skb->protocol = htons(ETH_P_IPV6);
 			else
 				goto error;
 			skb->pkt_type = PACKET_HOST;
diff --git a/net/core/dev.c b/net/core/dev.c
index f48d1b24f9c..fdfc4b6a644 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1675,13 +1675,13 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
 	}
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		ip_proto = ip_hdr(skb)->protocol;
 		addr1 = ip_hdr(skb)->saddr;
 		addr2 = ip_hdr(skb)->daddr;
 		ihl = ip_hdr(skb)->ihl;
 		break;
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		ip_proto = ipv6_hdr(skb)->nexthdr;
 		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
 		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index a80839b02e3..647a9edee37 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -129,7 +129,7 @@ int eth_rebuild_header(struct sk_buff *skb)
 
 	switch (eth->h_proto) {
 #ifdef CONFIG_INET
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		return arp_find(eth->h_dest, skb);
 #endif
 	default:
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index ece748dbd0c..958abf3e5f8 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -938,7 +938,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 
 	EnterFunction(11);
 
-	af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
@@ -1258,7 +1258,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	struct ip_vs_conn *cp;
 	int ret, restart, af;
 
-	af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 17c7b098cdb..64ce3d33d9c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1050,10 +1050,10 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		ret = ip4ip6_tnl_xmit(skb, dev);
 		break;
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		ret = ip6ip6_tnl_xmit(skb, dev);
 		break;
 	default:
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 6748dedcab5..c703f8b44e9 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -39,7 +39,7 @@ static unsigned int classify_1d(struct sk_buff *skb)
 		return skb->priority - 256;
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		dscp = ip_hdr(skb)->tos & 0xfc;
 		break;
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 8f63a1a9401..0ebaff637e3 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -67,9 +67,9 @@ static inline u32 addr_fold(void *addr)
 static u32 flow_get_src(const struct sk_buff *skb)
 {
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		return ntohl(ip_hdr(skb)->saddr);
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
 	default:
 		return addr_fold(skb->sk);
@@ -79,9 +79,9 @@ static u32 flow_get_src(const struct sk_buff *skb)
 static u32 flow_get_dst(const struct sk_buff *skb)
 {
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		return ntohl(ip_hdr(skb)->daddr);
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
 	default:
 		return addr_fold(skb->dst) ^ (__force u16)skb->protocol;
@@ -91,9 +91,9 @@ static u32 flow_get_dst(const struct sk_buff *skb)
 static u32 flow_get_proto(const struct sk_buff *skb)
 {
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		return ip_hdr(skb)->protocol;
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		return ipv6_hdr(skb)->nexthdr;
 	default:
 		return 0;
@@ -120,7 +120,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb)
 	u32 res = 0;
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP): {
+	case htons(ETH_P_IP): {
 		struct iphdr *iph = ip_hdr(skb);
 
 		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
@@ -128,7 +128,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb)
 			res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4));
 		break;
 	}
-	case __constant_htons(ETH_P_IPV6): {
+	case htons(ETH_P_IPV6): {
 		struct ipv6hdr *iph = ipv6_hdr(skb);
 
 		if (has_ports(iph->nexthdr))
@@ -147,7 +147,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb)
 	u32 res = 0;
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP): {
+	case htons(ETH_P_IP): {
 		struct iphdr *iph = ip_hdr(skb);
 
 		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
@@ -155,7 +155,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb)
 			res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2));
 		break;
 	}
-	case __constant_htons(ETH_P_IPV6): {
+	case htons(ETH_P_IPV6): {
 		struct ipv6hdr *iph = ipv6_hdr(skb);
 
 		if (has_ports(iph->nexthdr))
@@ -213,9 +213,9 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
 static u32 flow_get_nfct_src(const struct sk_buff *skb)
 {
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		return ntohl(CTTUPLE(skb, src.u3.ip));
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
 	}
 fallback:
@@ -225,9 +225,9 @@ fallback:
 static u32 flow_get_nfct_dst(const struct sk_buff *skb)
 {
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		return ntohl(CTTUPLE(skb, dst.u3.ip));
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
 	}
 fallback:
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index edd1298f85f..ba43aab3a85 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -202,7 +202,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	if (p->set_tc_index) {
 		switch (skb->protocol) {
-		case __constant_htons(ETH_P_IP):
+		case htons(ETH_P_IP):
 			if (skb_cow_head(skb, sizeof(struct iphdr)))
 				goto drop;
 
@@ -210,7 +210,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 				& ~INET_ECN_MASK;
 			break;
 
-		case __constant_htons(ETH_P_IPV6):
+		case htons(ETH_P_IPV6):
 			if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
 				goto drop;
 
@@ -289,11 +289,11 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 	pr_debug("index %d->%d\n", skb->tc_index, index);
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 		ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
 				    p->value[index]);
 			break;
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 		ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
 				    p->value[index]);
 			break;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 6e041d10dbd..fe1508ef0d3 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -119,7 +119,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 	u32 h, h2;
 
 	switch (skb->protocol) {
-	case __constant_htons(ETH_P_IP):
+	case htons(ETH_P_IP):
 	{
 		const struct iphdr *iph = ip_hdr(skb);
 		h = iph->daddr;
@@ -134,7 +134,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 			h2 ^= *(((u32*)iph) + iph->ihl);
 		break;
 	}
-	case __constant_htons(ETH_P_IPV6):
+	case htons(ETH_P_IPV6):
 	{
 		struct ipv6hdr *iph = ipv6_hdr(skb);
 		h = iph->daddr.s6_addr32[3];
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e55427f73df..5c1954d28d0 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -769,7 +769,7 @@ repost:
 	/* check for expected message types */
 	/* The order of some of these tests is important. */
 	switch (headerp->rm_type) {
-	case __constant_htonl(RDMA_MSG):
+	case htonl(RDMA_MSG):
 		/* never expect read chunks */
 		/* never expect reply chunks (two ways to check) */
 		/* never expect write chunks without having offered RDMA */
@@ -802,7 +802,7 @@ repost:
 		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
 		break;
 
-	case __constant_htonl(RDMA_NOMSG):
+	case htonl(RDMA_NOMSG):
 		/* never expect read or write chunks, always reply chunks */
 		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
 		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
-- 
cgit v1.2.3


From f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 Mon Sep 17 00:00:00 2001
From: Tom Quetchenbach <virtualphtn@gmail.com>
Date: Sun, 21 Sep 2008 00:21:51 -0700
Subject: tcp: advertise MSS requested by user

I'm trying to use the TCP_MAXSEG option to setsockopt() to set the MSS
for both sides of a bidirectional connection.

man tcp says: "If this option is set before connection establishment, it
also changes the MSS value announced to the other end in the initial
packet."

However, the kernel only uses the MTU/route cache to set the advertised
MSS. That means if I set the MSS to, say, 500 before calling connect(),
I will send at most 500-byte packets, but I will still receive 1500-byte
packets in reply.

This is a bug, either in the kernel or the documentation.

This patch (applies to latest net-2.6) reduces the advertised value to
that requested by the user as long as setsockopt() is called before
connect() or accept(). This seems like the behavior that one would
expect as well as that which is documented.

I've tried to make sure that things that depend on the advertised MSS
are set correctly.

Signed-off-by: Tom Quetchenbach <virtualphtn@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c   |  4 ++++
 net/ipv4/tcp_output.c | 13 ++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3dfbc21e555..44aef1c1f37 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1364,6 +1364,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+	if (tcp_sk(sk)->rx_opt.user_mss &&
+	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
+		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+
 	tcp_initialize_rcv_mss(newsk);
 
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f9793a37b6..c3d58ee3e16 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2232,6 +2232,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	struct sk_buff *skb;
 	struct tcp_md5sig_key *md5;
 	__u8 *md5_hash_location;
+	int mss;
 
 	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
 	if (skb == NULL)
@@ -2242,13 +2243,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	skb->dst = dst_clone(dst);
 
+	mss = dst_metric(dst, RTAX_ADVMSS);
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+		mss = tp->rx_opt.user_mss;
+
 	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
 		__u8 rcv_wscale;
 		/* Set this up on the first call only */
 		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
 		/* tcp_full_space because it is guaranteed to be the first packet */
 		tcp_select_initial_window(tcp_full_space(sk),
-			dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
 			&req->rcv_wnd,
 			&req->window_clamp,
 			ireq->wscale_ok,
@@ -2258,8 +2263,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	memset(&opts, 0, sizeof(opts));
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	tcp_header_size = tcp_synack_options(sk, req,
-					     dst_metric(dst, RTAX_ADVMSS),
+	tcp_header_size = tcp_synack_options(sk, req, mss,
 					     skb, &opts, &md5) +
 			  sizeof(struct tcphdr);
 
@@ -2333,6 +2337,9 @@ static void tcp_connect_init(struct sock *sk)
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
+		tp->advmss = tp->rx_opt.user_mss;
+
 	tcp_initialize_rcv_mss(sk);
 
 	tcp_select_initial_window(tcp_full_space(sk),
-- 
cgit v1.2.3


From e6f225ebb7c35fe30fdf8608927c5cf8fce6de7d Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Fri, 19 Sep 2008 20:41:56 +0200
Subject: ipvs: Restrict sync message to 255 connections

The nr_conns variable in the sync message header is only eight bits wide
and will overflow on interfaces with a large MTU. As a result the backup
won't parse all connections contained in the sync buffer. On regular
ethernet with an MTU of 1500 this isn't a problem, because we can't
overflow the value, but consider jumbo frames being used on a cross-over
connection between both directors.

We now restrict the size of the sync buffer, so that we never put more
than 255 connections into a single sync buffer.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_sync.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 28237a5f62e..de5e7e118ee 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -30,6 +30,7 @@
 #include <linux/err.h>
 #include <linux/kthread.h>
 #include <linux/wait.h>
+#include <linux/kernel.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
@@ -99,6 +100,7 @@ struct ip_vs_sync_thread_data {
 */
 
 #define SYNC_MESG_HEADER_LEN	4
+#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
 
 struct ip_vs_sync_mesg {
 	__u8                    nr_conns;
@@ -516,8 +518,8 @@ static int set_sync_mesg_maxlen(int sync_state)
 		num = (dev->mtu - sizeof(struct iphdr) -
 		       sizeof(struct udphdr) -
 		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-		sync_send_mesg_maxlen =
-			SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
+		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
 		IP_VS_DBG(7, "setting the maximum length of sync sending "
 			  "message %d.\n", sync_send_mesg_maxlen);
 	} else if (sync_state == IP_VS_STATE_BACKUP) {
-- 
cgit v1.2.3


From 8d5803bf6fbe5264000afc8c34bff08e8ecc023b Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Sat, 20 Sep 2008 11:48:33 +0200
Subject: ipvs: Fix unused label warning

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/ipvs/ip_vs_ctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 771551d8fba..0302cf3e503 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1330,7 +1330,9 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 
   out_unlock:
 	write_unlock_bh(&__ip_vs_svc_lock);
+#ifdef CONFIG_IP_VS_IPV6
   out:
+#endif
 
 	if (old_sched)
 		ip_vs_scheduler_put(old_sched);
-- 
cgit v1.2.3


From 43f59c89399fd76883a06c551f24794e98409432 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 21 Sep 2008 21:28:51 -0700
Subject: net: Remove __skb_insert() calls outside of skbuff internals.

This minor cleanup simplifies later changes which will convert
struct sk_buff and friends over to using struct list_head.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 net/sctp/ulpqueue.c  | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 85627f83665..cbfe13d5f42 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4156,7 +4156,7 @@ drop:
 				skb1 = skb1->prev;
 			}
 		}
-		__skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
+		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 
 		/* And clean segments covered by new one as whole. */
 		while ((skb1 = skb->next) !=
@@ -4254,7 +4254,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 		memcpy(nskb->head, skb->head, header);
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		__skb_insert(nskb, skb->prev, skb, list);
+		__skb_queue_before(list, skb, nskb);
 		skb_set_owner_r(nskb, sk);
 
 		/* Copy data, releasing collapsed skbs. */
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 5061a26c502..7b23803343c 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -317,7 +317,7 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
 	}
 
 	/* Insert before pos. */
-	__skb_insert(sctp_event2skb(event), pos->prev, pos, &ulpq->reasm);
+	__skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
 
 }
 
@@ -825,8 +825,7 @@ static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
 
 
 	/* Insert before pos. */
-	__skb_insert(sctp_event2skb(event), pos->prev, pos, &ulpq->lobby);
-
+	__skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
 }
 
 static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
-- 
cgit v1.2.3


From 2cdc55751c33829f00510e0104562d0f8d8a9b85 Mon Sep 17 00:00:00 2001
From: Kaihui Luo <kaih.luo@gmail.com>
Date: Mon, 22 Sep 2008 19:02:36 -0700
Subject: netfilter: xt_time gives a wrong monthday in a leap year

The function localtime_3 in xt_time.c gives a wrong monthday in a leap
year after 28th 2.  calculating monthday should use the array
days_since_leapyear[] not days_since_year[] in a leap year.

Signed-off-by: Kaihui Luo <kaih.luo@gmail.com>
Acked-by: Jan Engelhardt <jengelh@computergmbh.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_time.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 9f328593287..307a2c3c2df 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -136,17 +136,19 @@ static void localtime_3(struct xtm *r, time_t time)
 	 * from w repeatedly while counting.)
 	 */
 	if (is_leap(year)) {
+		/* use days_since_leapyear[] in a leap year */
 		for (i = ARRAY_SIZE(days_since_leapyear) - 1;
-		    i > 0 && days_since_year[i] > w; --i)
+		    i > 0 && days_since_leapyear[i] > w; --i)
 			/* just loop */;
+		r->monthday = w - days_since_leapyear[i] + 1;
 	} else {
 		for (i = ARRAY_SIZE(days_since_year) - 1;
 		    i > 0 && days_since_year[i] > w; --i)
 			/* just loop */;
+		r->monthday = w - days_since_year[i] + 1;
 	}
 
 	r->month    = i + 1;
-	r->monthday = w - days_since_year[i] + 1;
 	return;
 }
 
-- 
cgit v1.2.3


From d48abfecea8513cfd2fd7e341439c1b8a28e9ff4 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 22 Sep 2008 19:20:51 -0700
Subject: net: em_cmp.c use unaligned access helpers

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/em_cmp.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index cc49c932641..bc450397487 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/tc_ematch/tc_em_cmp.h>
+#include <asm/unaligned.h>
 #include <net/pkt_cls.h>
 
 static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
@@ -37,8 +38,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
 			break;
 
 		case TCF_EM_ALIGN_U16:
-			val = *ptr << 8;
-			val |= *(ptr+1);
+			val = get_unaligned_be16(ptr);
 
 			if (cmp_needs_transformation(cmp))
 				val = be16_to_cpu(val);
@@ -47,10 +47,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
 		case TCF_EM_ALIGN_U32:
 			/* Worth checking boundries? The branching seems
 			 * to get worse. Visit again. */
-			val = *ptr << 24;
-			val |= *(ptr+1) << 16;
-			val |= *(ptr+2) << 8;
-			val |= *(ptr+3);
+			val = get_unaligned_be32(ptr);
 
 			if (cmp_needs_transformation(cmp))
 				val = be32_to_cpu(val);
-- 
cgit v1.2.3


From 5e687220a047dc4f0c2fb9ce886359a23075ddbc Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 22 Sep 2008 19:24:45 -0700
Subject: net/atm/lec.c: drop code after return

The break after the return serves no purpose.

Signed-off-by: Julia Lawall <julia@diku.dk>
Reviewed-by: Richard Genoud <richard.genoud@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/lec.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5799fb52365..8f701cde594 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1931,7 +1931,6 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
 		switch (priv->lane_version) {
 		case 1:
 			return priv->mcast_vcc;
-			break;
 		case 2:	/* LANE2 wants arp for multicast addresses */
 			if (!compare_ether_addr(mac_to_find, bus_mac))
 				return priv->mcast_vcc;
-- 
cgit v1.2.3


From 5c1824587f0797373c95719a196f6098f7c6d20c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 22 Sep 2008 19:48:19 -0700
Subject: ipsec: Fix xfrm_state_walk race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As discovered by Timo Teräs, the currently xfrm_state_walk scheme
is racy because if a second dump finishes before the first, we
may free xfrm states that the first dump would walk over later.

This patch fixes this by storing the dumps in a list in order
to calculate the correct completion counter which cures this
problem.

I've expanded netlink_cb in order to accomodate the extra state
related to this.  It shouldn't be a big deal since netlink_cb
is kmalloced for each dump and we're just increasing it by 4 or
8 bytes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_state.c | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index abbe2702c40..053970e8765 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -64,6 +64,9 @@ static unsigned long xfrm_state_walk_ongoing;
 /* Counter indicating walk completion, protected by xfrm_cfg_mutex. */
 static unsigned long xfrm_state_walk_completed;
 
+/* List of outstanding state walks used to set the completed counter.  */
+static LIST_HEAD(xfrm_state_walks);
+
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
 
@@ -1584,7 +1587,6 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
 			if (err) {
 				xfrm_state_hold(last);
 				walk->state = last;
-				xfrm_state_walk_ongoing++;
 				goto out;
 			}
 		}
@@ -1599,25 +1601,44 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
 		err = func(last, 0, data);
 out:
 	spin_unlock_bh(&xfrm_state_lock);
-	if (old != NULL) {
+	if (old != NULL)
 		xfrm_state_put(old);
-		xfrm_state_walk_completed++;
-		if (!list_empty(&xfrm_state_gc_leftovers))
-			schedule_work(&xfrm_state_gc_work);
-	}
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_walk);
 
+void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
+{
+	walk->proto = proto;
+	walk->state = NULL;
+	walk->count = 0;
+	list_add_tail(&walk->list, &xfrm_state_walks);
+	walk->genid = ++xfrm_state_walk_ongoing;
+}
+EXPORT_SYMBOL(xfrm_state_walk_init);
+
 void xfrm_state_walk_done(struct xfrm_state_walk *walk)
 {
+	struct list_head *prev;
+
 	if (walk->state != NULL) {
 		xfrm_state_put(walk->state);
 		walk->state = NULL;
-		xfrm_state_walk_completed++;
-		if (!list_empty(&xfrm_state_gc_leftovers))
-			schedule_work(&xfrm_state_gc_work);
 	}
+
+	prev = walk->list.prev;
+	list_del(&walk->list);
+
+	if (prev != &xfrm_state_walks) {
+		list_entry(prev, struct xfrm_state_walk, list)->genid =
+			walk->genid;
+		return;
+	}
+
+	xfrm_state_walk_completed = walk->genid;
+
+	if (!list_empty(&xfrm_state_gc_leftovers))
+		schedule_work(&xfrm_state_gc_work);
 }
 EXPORT_SYMBOL(xfrm_state_walk_done);
 
-- 
cgit v1.2.3


From bce7b15426cac3000bf6a9cf59d9356ef0be2dec Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 19:51:15 -0700
Subject: Phonet: global definitions

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 23b8b9da36b..2d358dd8a03 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -154,7 +154,8 @@ static const char *af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
-  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_MAX"
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
+  "sk_lock-AF_MAX"
 };
 static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -168,7 +169,8 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
-  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_MAX"
+  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
+  "slock-AF_MAX"
 };
 static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -182,7 +184,8 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
-  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_MAX"
+  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
+  "clock-AF_MAX"
 };
 #endif
 
-- 
cgit v1.2.3


From 4b07b3f69a8471cdc142c51461a331226fef248a Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:02:10 -0700
Subject: Phonet: PF_PHONET protocol family support

This is the basis for the Phonet protocol families, and introduces
the ETH_P_PHONET packet type and the PF_PHONET socket family.

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/af_phonet.c | 216 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 net/phonet/af_phonet.c

(limited to 'net')

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
new file mode 100644
index 00000000000..0cfea9bc994
--- /dev/null
+++ b/net/phonet/af_phonet.c
@@ -0,0 +1,216 @@
+/*
+ * File: af_phonet.c
+ *
+ * Phonet protocols family
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/unaligned.h>
+#include <net/sock.h>
+
+#include <linux/if_phonet.h>
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+
+static struct net_proto_family phonet_proto_family;
+static struct phonet_protocol *phonet_proto_get(int protocol);
+static inline void phonet_proto_put(struct phonet_protocol *pp);
+
+/* protocol family functions */
+
+static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
+{
+	struct phonet_protocol *pnp;
+	int err;
+
+	if (net != &init_net)
+		return -EAFNOSUPPORT;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (protocol == 0) {
+		/* Default protocol selection */
+		switch (sock->type) {
+		case SOCK_DGRAM:
+			protocol = PN_PROTO_PHONET;
+			break;
+		default:
+			return -EPROTONOSUPPORT;
+		}
+	}
+
+	pnp = phonet_proto_get(protocol);
+	if (pnp == NULL)
+		return -EPROTONOSUPPORT;
+	if (sock->type != pnp->sock_type) {
+		err = -EPROTONOSUPPORT;
+		goto out;
+	}
+
+	/* TODO: create and init the struct sock */
+	err = -EPROTONOSUPPORT;
+
+out:
+	phonet_proto_put(pnp);
+	return err;
+}
+
+static struct net_proto_family phonet_proto_family = {
+	.family = AF_PHONET,
+	.create = pn_socket_create,
+	.owner = THIS_MODULE,
+};
+
+/* packet type functions */
+
+/*
+ * Stuff received packets to associated sockets.
+ * On error, returns non-zero and releases the skb.
+ */
+static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
+			struct packet_type *pkttype,
+			struct net_device *orig_dev)
+{
+	struct phonethdr *ph;
+	struct sockaddr_pn sa;
+	u16 len;
+
+	if (dev_net(dev) != &init_net)
+		goto out;
+
+	/* check we have at least a full Phonet header */
+	if (!pskb_pull(skb, sizeof(struct phonethdr)))
+		goto out;
+
+	/* check that the advertised length is correct */
+	ph = pn_hdr(skb);
+	len = get_unaligned_be16(&ph->pn_length);
+	if (len < 2)
+		goto out;
+	len -= 2;
+	if ((len > skb->len) || pskb_trim(skb, len))
+		goto out;
+	skb_reset_transport_header(skb);
+
+	pn_skb_get_dst_sockaddr(skb, &sa);
+	if (pn_sockaddr_get_addr(&sa) == 0)
+		goto out; /* currently, we cannot be device 0 */
+
+	/* TODO: put packets to sockets backlog */
+
+out:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static struct packet_type phonet_packet_type = {
+	.type = __constant_htons(ETH_P_PHONET),
+	.dev = NULL,
+	.func = phonet_rcv,
+};
+
+/* Transport protocol registration */
+static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static DEFINE_SPINLOCK(proto_tab_lock);
+
+int __init_or_module phonet_proto_register(int protocol,
+						struct phonet_protocol *pp)
+{
+	int err = 0;
+
+	if (protocol >= PHONET_NPROTO)
+		return -EINVAL;
+
+	err = proto_register(pp->prot, 1);
+	if (err)
+		return err;
+
+	spin_lock(&proto_tab_lock);
+	if (proto_tab[protocol])
+		err = -EBUSY;
+	else
+		proto_tab[protocol] = pp;
+	spin_unlock(&proto_tab_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(phonet_proto_register);
+
+void phonet_proto_unregister(int protocol, struct phonet_protocol *pp)
+{
+	spin_lock(&proto_tab_lock);
+	BUG_ON(proto_tab[protocol] != pp);
+	proto_tab[protocol] = NULL;
+	spin_unlock(&proto_tab_lock);
+	proto_unregister(pp->prot);
+}
+EXPORT_SYMBOL(phonet_proto_unregister);
+
+static struct phonet_protocol *phonet_proto_get(int protocol)
+{
+	struct phonet_protocol *pp;
+
+	if (protocol >= PHONET_NPROTO)
+		return NULL;
+
+	spin_lock(&proto_tab_lock);
+	pp = proto_tab[protocol];
+	if (pp && !try_module_get(pp->prot->owner))
+		pp = NULL;
+	spin_unlock(&proto_tab_lock);
+
+	return pp;
+}
+
+static inline void phonet_proto_put(struct phonet_protocol *pp)
+{
+	module_put(pp->prot->owner);
+}
+
+/* Module registration */
+static int __init phonet_init(void)
+{
+	int err;
+
+	err = sock_register(&phonet_proto_family);
+	if (err) {
+		printk(KERN_ALERT
+			"phonet protocol family initialization failed\n");
+		return err;
+	}
+
+	dev_add_pack(&phonet_packet_type);
+	return 0;
+}
+
+static void __exit phonet_exit(void)
+{
+	sock_unregister(AF_PHONET);
+	dev_remove_pack(&phonet_packet_type);
+}
+
+module_init(phonet_init);
+module_exit(phonet_exit);
+MODULE_DESCRIPTION("Phonet protocol stack for Linux");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 8ead536dec142f27d5b5f72c3994eb39f4741717 Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:03:00 -0700
Subject: Phonet: add CONFIG_PHONET

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/Kconfig         |  1 +
 net/Makefile        |  1 +
 net/phonet/Kconfig  | 16 ++++++++++++++++
 net/phonet/Makefile |  4 ++++
 4 files changed, 22 insertions(+)
 create mode 100644 net/phonet/Kconfig
 create mode 100644 net/phonet/Makefile

(limited to 'net')

diff --git a/net/Kconfig b/net/Kconfig
index d87de48ba65..9103a16a77b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -232,6 +232,7 @@ source "net/can/Kconfig"
 source "net/irda/Kconfig"
 source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
+source "net/phonet/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index 4f43e7f874f..acaf819f24a 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_AF_RXRPC)		+= rxrpc/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_DECNET)		+= decnet/
 obj-$(CONFIG_ECONET)		+= econet/
+obj-$(CONFIG_PHONET)		+= phonet/
 ifneq ($(CONFIG_VLAN_8021Q),)
 obj-y				+= 8021q/
 endif
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
new file mode 100644
index 00000000000..51a5669573f
--- /dev/null
+++ b/net/phonet/Kconfig
@@ -0,0 +1,16 @@
+#
+# Phonet protocol
+#
+
+config PHONET
+	tristate "Phonet protocols family"
+	help
+	  The Phone Network protocol (PhoNet) is a packet-oriented
+	  communication protocol developped by Nokia for use with its modems.
+
+	  This is required for Maemo to use cellular data connectivity (if
+	  supported). It can also be used to control Nokia phones
+	  from a Linux computer, although AT commands may be easier to use.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called phonet. If unsure, say N.
diff --git a/net/phonet/Makefile b/net/phonet/Makefile
new file mode 100644
index 00000000000..5dbff68a6f3
--- /dev/null
+++ b/net/phonet/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_PHONET) += phonet.o
+
+phonet-objs := \
+	af_phonet.o
-- 
cgit v1.2.3


From f8ff60283de2b6775d7a14619056a08e3083bd40 Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:03:44 -0700
Subject: Phonet: network device and address handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This provides support for adding Phonet addresses to and removing
Phonet addresses from network devices.

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile    |   1 +
 net/phonet/af_phonet.c |   3 +
 net/phonet/pn_dev.c    | 208 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 212 insertions(+)
 create mode 100644 net/phonet/pn_dev.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index 5dbff68a6f3..980a3866c9a 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_PHONET) += phonet.o
 
 phonet-objs := \
+	pn_dev.o \
 	af_phonet.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 0cfea9bc994..a8ba6f177b2 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -31,6 +31,7 @@
 #include <linux/if_phonet.h>
 #include <linux/phonet.h>
 #include <net/phonet/phonet.h>
+#include <net/phonet/pn_dev.h>
 
 static struct net_proto_family phonet_proto_family;
 static struct phonet_protocol *phonet_proto_get(int protocol);
@@ -200,6 +201,7 @@ static int __init phonet_init(void)
 		return err;
 	}
 
+	phonet_device_init();
 	dev_add_pack(&phonet_packet_type);
 	return 0;
 }
@@ -208,6 +210,7 @@ static void __exit phonet_exit(void)
 {
 	sock_unregister(AF_PHONET);
 	dev_remove_pack(&phonet_packet_type);
+	phonet_device_exit();
 }
 
 module_init(phonet_init);
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
new file mode 100644
index 00000000000..53be9fc82aa
--- /dev/null
+++ b/net/phonet/pn_dev.c
@@ -0,0 +1,208 @@
+/*
+ * File: pn_dev.c
+ *
+ * Phonet network device
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/phonet.h>
+#include <net/sock.h>
+#include <net/phonet/pn_dev.h>
+
+/* when accessing, remember to lock with spin_lock(&pndevs.lock); */
+struct phonet_device_list pndevs = {
+	.list = LIST_HEAD_INIT(pndevs.list),
+	.lock = __SPIN_LOCK_UNLOCKED(pndevs.lock),
+};
+
+/* Allocate new Phonet device. */
+static struct phonet_device *__phonet_device_alloc(struct net_device *dev)
+{
+	struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC);
+	if (pnd == NULL)
+		return NULL;
+	pnd->netdev = dev;
+	bitmap_zero(pnd->addrs, 64);
+
+	list_add(&pnd->list, &pndevs.list);
+	return pnd;
+}
+
+static struct phonet_device *__phonet_get(struct net_device *dev)
+{
+	struct phonet_device *pnd;
+
+	list_for_each_entry(pnd, &pndevs.list, list) {
+		if (pnd->netdev == dev)
+			return pnd;
+	}
+	return NULL;
+}
+
+static void __phonet_device_free(struct phonet_device *pnd)
+{
+	list_del(&pnd->list);
+	kfree(pnd);
+}
+
+struct net_device *phonet_device_get(struct net *net)
+{
+	struct phonet_device *pnd;
+	struct net_device *dev;
+
+	spin_lock_bh(&pndevs.lock);
+	list_for_each_entry(pnd, &pndevs.list, list) {
+		dev = pnd->netdev;
+		BUG_ON(!dev);
+
+		if (dev_net(dev) == net &&
+			(dev->reg_state == NETREG_REGISTERED) &&
+			((pnd->netdev->flags & IFF_UP)) == IFF_UP)
+			break;
+		dev = NULL;
+	}
+	if (dev)
+		dev_hold(dev);
+	spin_unlock_bh(&pndevs.lock);
+	return dev;
+}
+
+int phonet_address_add(struct net_device *dev, u8 addr)
+{
+	struct phonet_device *pnd;
+	int err = 0;
+
+	spin_lock_bh(&pndevs.lock);
+	/* Find or create Phonet-specific device data */
+	pnd = __phonet_get(dev);
+	if (pnd == NULL)
+		pnd = __phonet_device_alloc(dev);
+	if (unlikely(pnd == NULL))
+		err = -ENOMEM;
+	else if (test_and_set_bit(addr >> 2, pnd->addrs))
+		err = -EEXIST;
+	spin_unlock_bh(&pndevs.lock);
+	return err;
+}
+
+int phonet_address_del(struct net_device *dev, u8 addr)
+{
+	struct phonet_device *pnd;
+	int err = 0;
+
+	spin_lock_bh(&pndevs.lock);
+	pnd = __phonet_get(dev);
+	if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs))
+		err = -EADDRNOTAVAIL;
+	if (bitmap_empty(pnd->addrs, 64))
+		__phonet_device_free(pnd);
+	spin_unlock_bh(&pndevs.lock);
+	return err;
+}
+
+/* Gets a source address toward a destination, through a interface. */
+u8 phonet_address_get(struct net_device *dev, u8 addr)
+{
+	struct phonet_device *pnd;
+
+	spin_lock_bh(&pndevs.lock);
+	pnd = __phonet_get(dev);
+	if (pnd) {
+		BUG_ON(bitmap_empty(pnd->addrs, 64));
+
+		/* Use same source address as destination, if possible */
+		if (!test_bit(addr >> 2, pnd->addrs))
+			addr = find_first_bit(pnd->addrs, 64) << 2;
+	} else
+		addr = PN_NO_ADDR;
+	spin_unlock_bh(&pndevs.lock);
+	return addr;
+}
+
+int phonet_address_lookup(u8 addr)
+{
+	struct phonet_device *pnd;
+
+	spin_lock_bh(&pndevs.lock);
+	list_for_each_entry(pnd, &pndevs.list, list) {
+		/* Don't allow unregistering devices! */
+		if ((pnd->netdev->reg_state != NETREG_REGISTERED) ||
+				((pnd->netdev->flags & IFF_UP)) != IFF_UP)
+			continue;
+
+		if (test_bit(addr >> 2, pnd->addrs)) {
+			spin_unlock_bh(&pndevs.lock);
+			return 0;
+		}
+	}
+	spin_unlock_bh(&pndevs.lock);
+	return -EADDRNOTAVAIL;
+}
+
+/* notify Phonet of device events */
+static int phonet_device_notify(struct notifier_block *me, unsigned long what,
+				void *arg)
+{
+	struct net_device *dev = arg;
+
+	if (what == NETDEV_UNREGISTER) {
+		struct phonet_device *pnd;
+
+		/* Destroy phonet-specific device data */
+		spin_lock_bh(&pndevs.lock);
+		pnd = __phonet_get(dev);
+		if (pnd)
+			__phonet_device_free(pnd);
+		spin_unlock_bh(&pndevs.lock);
+	}
+	return 0;
+
+}
+
+static struct notifier_block phonet_device_notifier = {
+	.notifier_call = phonet_device_notify,
+	.priority = 0,
+};
+
+/* Initialize Phonet devices list */
+void phonet_device_init(void)
+{
+	register_netdevice_notifier(&phonet_device_notifier);
+}
+
+void phonet_device_exit(void)
+{
+	struct phonet_device *pnd, *n;
+
+	rtnl_unregister_all(PF_PHONET);
+	rtnl_lock();
+	spin_lock_bh(&pndevs.lock);
+
+	list_for_each_entry_safe(pnd, n, &pndevs.list, list)
+		__phonet_device_free(pnd);
+
+	spin_unlock_bh(&pndevs.lock);
+	rtnl_unlock();
+	unregister_netdevice_notifier(&phonet_device_notifier);
+}
-- 
cgit v1.2.3


From 8fb397406f6470f79040c41eec49af20900a9e3b Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:04:30 -0700
Subject: Phonet: Netlink interface

This provides support for configuring Phonet addresses, notifying
Phonet configuration changes, and dumping the configuration.

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile     |   1 +
 net/phonet/af_phonet.c  |   1 +
 net/phonet/pn_netlink.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 188 insertions(+)
 create mode 100644 net/phonet/pn_netlink.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index 980a3866c9a..4143c3e1dfd 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -2,4 +2,5 @@ obj-$(CONFIG_PHONET) += phonet.o
 
 phonet-objs := \
 	pn_dev.o \
+	pn_netlink.o \
 	af_phonet.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index a8ba6f177b2..5c729ba5693 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -203,6 +203,7 @@ static int __init phonet_init(void)
 
 	phonet_device_init();
 	dev_add_pack(&phonet_packet_type);
+	phonet_netlink_register();
 	return 0;
 }
 
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
new file mode 100644
index 00000000000..b1ea19a230d
--- /dev/null
+++ b/net/phonet/pn_netlink.c
@@ -0,0 +1,186 @@
+/*
+ * File: pn_netlink.c
+ *
+ * Phonet netlink interface
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/phonet.h>
+#include <net/sock.h>
+#include <net/phonet/pn_dev.h>
+
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
+		     u32 pid, u32 seq, int event);
+
+static void rtmsg_notify(int event, struct net_device *dev, u8 addr)
+{
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+			nla_total_size(1), GFP_KERNEL);
+	if (skb == NULL)
+		goto errout;
+	err = fill_addr(skb, dev, addr, 0, 0, event);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	err = rtnl_notify(skb, dev_net(dev), 0,
+			  RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
+}
+
+static int newaddr_doit(struct sk_buff *skb, struct nlmsghdr *nlm, void *attr)
+{
+	struct rtattr **rta = attr;
+	struct ifaddrmsg *ifm = NLMSG_DATA(nlm);
+	struct net_device *dev;
+	int err;
+	u8 pnaddr;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ASSERT_RTNL();
+
+	if (rta[IFA_LOCAL - 1] == NULL)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(&init_net, ifm->ifa_index);
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (ifm->ifa_prefixlen > 0)
+		return -EINVAL;
+
+	memcpy(&pnaddr, RTA_DATA(rta[IFA_LOCAL - 1]), 1);
+
+	err = phonet_address_add(dev, pnaddr);
+	if (!err)
+		rtmsg_notify(RTM_NEWADDR, dev, pnaddr);
+	return err;
+}
+
+static int deladdr_doit(struct sk_buff *skb, struct nlmsghdr *nlm, void *attr)
+{
+	struct rtattr **rta = attr;
+	struct ifaddrmsg *ifm = NLMSG_DATA(nlm);
+	struct net_device *dev;
+	int err;
+	u8 pnaddr;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ASSERT_RTNL();
+
+	if (rta[IFA_LOCAL - 1] == NULL)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(&init_net, ifm->ifa_index);
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (ifm->ifa_prefixlen > 0)
+		return -EADDRNOTAVAIL;
+
+	memcpy(&pnaddr, RTA_DATA(rta[IFA_LOCAL - 1]), 1);
+
+	err = phonet_address_del(dev, pnaddr);
+	if (!err)
+		rtmsg_notify(RTM_DELADDR, dev, pnaddr);
+	return err;
+}
+
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
+			u32 pid, u32 seq, int event)
+{
+	struct ifaddrmsg *ifm;
+	struct nlmsghdr *nlh;
+	unsigned int orig_len = skb->len;
+
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct ifaddrmsg));
+	ifm = NLMSG_DATA(nlh);
+	ifm->ifa_family = AF_PHONET;
+	ifm->ifa_prefixlen = 0;
+	ifm->ifa_flags = IFA_F_PERMANENT;
+	ifm->ifa_scope = RT_SCOPE_HOST;
+	ifm->ifa_index = dev->ifindex;
+	RTA_PUT(skb, IFA_LOCAL, 1, &addr);
+	nlh->nlmsg_len = skb->len - orig_len;
+
+	return 0;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, orig_len);
+
+	return -1;
+}
+
+static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct phonet_device *pnd;
+	int dev_idx = 0, dev_start_idx = cb->args[0];
+	int addr_idx = 0, addr_start_idx = cb->args[1];
+
+	spin_lock_bh(&pndevs.lock);
+	list_for_each_entry(pnd, &pndevs.list, list) {
+		u8 addr;
+
+		if (dev_idx > dev_start_idx)
+			addr_start_idx = 0;
+		if (dev_idx++ < dev_start_idx)
+			continue;
+
+		addr_idx = 0;
+		for (addr = find_first_bit(pnd->addrs, 64); addr < 64;
+			addr = find_next_bit(pnd->addrs, 64, 1+addr)) {
+			if (addr_idx++ < addr_start_idx)
+				continue;
+
+			if (fill_addr(skb, pnd->netdev, addr << 2,
+					 NETLINK_CB(cb->skb).pid,
+					cb->nlh->nlmsg_seq, RTM_NEWADDR))
+				goto out;
+		}
+	}
+
+out:
+	spin_unlock_bh(&pndevs.lock);
+	cb->args[0] = dev_idx;
+	cb->args[1] = addr_idx;
+
+	return skb->len;
+}
+
+void __init phonet_netlink_register(void)
+{
+	rtnl_register(PF_PHONET, RTM_NEWADDR, newaddr_doit, NULL);
+	rtnl_register(PF_PHONET, RTM_DELADDR, deladdr_doit, NULL);
+	rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit);
+}
-- 
cgit v1.2.3


From ba113a94b7503ee23ffe819e7045134b0c1d31de Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:05:19 -0700
Subject: Phonet: common socket glue

This provides the socket API for the Phonet protocols family.

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile    |   1 +
 net/phonet/af_phonet.c |  28 ++++-
 net/phonet/socket.c    | 311 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 337 insertions(+), 3 deletions(-)
 create mode 100644 net/phonet/socket.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index 4143c3e1dfd..c1d671de783 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -3,4 +3,5 @@ obj-$(CONFIG_PHONET) += phonet.o
 phonet-objs := \
 	pn_dev.o \
 	pn_netlink.o \
+	socket.o \
 	af_phonet.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 5c729ba5693..ba54d53020f 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -41,6 +41,8 @@ static inline void phonet_proto_put(struct phonet_protocol *pp);
 
 static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
 {
+	struct sock *sk;
+	struct pn_sock *pn;
 	struct phonet_protocol *pnp;
 	int err;
 
@@ -69,8 +71,22 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
 		goto out;
 	}
 
-	/* TODO: create and init the struct sock */
-	err = -EPROTONOSUPPORT;
+	sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot);
+	if (sk == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	sock_init_data(sock, sk);
+	sock->state = SS_UNCONNECTED;
+	sock->ops = pnp->ops;
+	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+	sk->sk_protocol = protocol;
+	pn = pn_sk(sk);
+	pn->sobject = 0;
+	pn->resource = 0;
+	sk->sk_prot->init(sk);
+	err = 0;
 
 out:
 	phonet_proto_put(pnp);
@@ -94,6 +110,7 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 			struct net_device *orig_dev)
 {
 	struct phonethdr *ph;
+	struct sock *sk;
 	struct sockaddr_pn sa;
 	u16 len;
 
@@ -118,7 +135,12 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (pn_sockaddr_get_addr(&sa) == 0)
 		goto out; /* currently, we cannot be device 0 */
 
-	/* TODO: put packets to sockets backlog */
+	sk = pn_find_sock_by_sa(&sa);
+	if (sk == NULL)
+		goto out;
+
+	/* Push data to the socket (or other sockets connected to it). */
+	return sk_receive_skb(sk, skb, 0);
 
 out:
 	kfree_skb(skb);
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
new file mode 100644
index 00000000000..99a4945d565
--- /dev/null
+++ b/net/phonet/socket.c
@@ -0,0 +1,311 @@
+/*
+ * File: socket.c
+ *
+ * Phonet sockets
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pn_dev.h>
+
+static int pn_socket_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk) {
+		sock->sk = NULL;
+		sk->sk_prot->close(sk, 0);
+	}
+	return 0;
+}
+
+static struct  {
+	struct hlist_head hlist;
+	spinlock_t lock;
+} pnsocks = {
+	.hlist = HLIST_HEAD_INIT,
+	.lock = __SPIN_LOCK_UNLOCKED(pnsocks.lock),
+};
+
+/*
+ * Find address based on socket address, match only certain fields.
+ * Also grab sock if it was found. Remember to sock_put it later.
+ */
+struct sock *pn_find_sock_by_sa(const struct sockaddr_pn *spn)
+{
+	struct hlist_node *node;
+	struct sock *sknode;
+	struct sock *rval = NULL;
+	u16 obj = pn_sockaddr_get_object(spn);
+	u8 res = spn->spn_resource;
+
+	spin_lock_bh(&pnsocks.lock);
+
+	sk_for_each(sknode, node, &pnsocks.hlist) {
+		struct pn_sock *pn = pn_sk(sknode);
+		BUG_ON(!pn->sobject); /* unbound socket */
+
+		if (pn_port(obj)) {
+			/* Look up socket by port */
+			if (pn_port(pn->sobject) != pn_port(obj))
+				continue;
+		} else {
+			/* If port is zero, look up by resource */
+			if (pn->resource != res)
+				continue;
+		}
+		if (pn_addr(pn->sobject)
+		 && pn_addr(pn->sobject) != pn_addr(obj))
+			continue;
+
+		rval = sknode;
+		sock_hold(sknode);
+		break;
+	}
+
+	spin_unlock_bh(&pnsocks.lock);
+
+	return rval;
+
+}
+
+void pn_sock_hash(struct sock *sk)
+{
+	spin_lock_bh(&pnsocks.lock);
+	sk_add_node(sk, &pnsocks.hlist);
+	spin_unlock_bh(&pnsocks.lock);
+}
+EXPORT_SYMBOL(pn_sock_hash);
+
+void pn_sock_unhash(struct sock *sk)
+{
+	spin_lock_bh(&pnsocks.lock);
+	sk_del_node_init(sk);
+	spin_unlock_bh(&pnsocks.lock);
+}
+EXPORT_SYMBOL(pn_sock_unhash);
+
+static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
+{
+	struct sock *sk = sock->sk;
+	struct pn_sock *pn = pn_sk(sk);
+	struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+	int err;
+	u16 handle;
+	u8 saddr;
+
+	if (sk->sk_prot->bind)
+		return sk->sk_prot->bind(sk, addr, len);
+
+	if (len < sizeof(struct sockaddr_pn))
+		return -EINVAL;
+	if (spn->spn_family != AF_PHONET)
+		return -EAFNOSUPPORT;
+
+	handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr);
+	saddr = pn_addr(handle);
+	if (saddr && phonet_address_lookup(saddr))
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) {
+		err = -EINVAL; /* attempt to rebind */
+		goto out;
+	}
+	err = sk->sk_prot->get_port(sk, pn_port(handle));
+	if (err)
+		goto out;
+
+	/* get_port() sets the port, bind() sets the address if applicable */
+	pn->sobject = pn_object(saddr, pn_port(pn->sobject));
+	pn->resource = spn->spn_resource;
+
+	/* Enable RX on the socket */
+	sk->sk_prot->hash(sk);
+out:
+	release_sock(sk);
+	return err;
+}
+
+static int pn_socket_autobind(struct socket *sock)
+{
+	struct sockaddr_pn sa;
+	int err;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.spn_family = AF_PHONET;
+	err = pn_socket_bind(sock, (struct sockaddr *)&sa,
+				sizeof(struct sockaddr_pn));
+	if (err != -EINVAL)
+		return err;
+	BUG_ON(!pn_port(pn_sk(sock->sk)->sobject));
+	return 0; /* socket was already bound */
+}
+
+static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
+				int *sockaddr_len, int peer)
+{
+	struct sock *sk = sock->sk;
+	struct pn_sock *pn = pn_sk(sk);
+
+	memset(addr, 0, sizeof(struct sockaddr_pn));
+	addr->sa_family = AF_PHONET;
+	if (!peer) /* Race with bind() here is userland's problem. */
+		pn_sockaddr_set_object((struct sockaddr_pn *)addr,
+					pn->sobject);
+
+	*sockaddr_len = sizeof(struct sockaddr_pn);
+	return 0;
+}
+
+static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
+				unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct pn_sock *pn = pn_sk(sk);
+
+	if (cmd == SIOCPNGETOBJECT) {
+		struct net_device *dev;
+		u16 handle;
+		u8 saddr;
+
+		if (get_user(handle, (__u16 __user *)arg))
+			return -EFAULT;
+
+		lock_sock(sk);
+		if (sk->sk_bound_dev_if)
+			dev = dev_get_by_index(sock_net(sk),
+						sk->sk_bound_dev_if);
+		else
+			dev = phonet_device_get(sock_net(sk));
+		if (dev && (dev->flags & IFF_UP))
+			saddr = phonet_address_get(dev, pn_addr(handle));
+		else
+			saddr = PN_NO_ADDR;
+		release_sock(sk);
+
+		if (dev)
+			dev_put(dev);
+		if (saddr == PN_NO_ADDR)
+			return -EHOSTUNREACH;
+
+		handle = pn_object(saddr, pn_port(pn->sobject));
+		return put_user(handle, (__u16 __user *)arg);
+	}
+
+	return sk->sk_prot->ioctl(sk, cmd, arg);
+}
+
+static int pn_socket_sendmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+
+	if (pn_socket_autobind(sock))
+		return -EAGAIN;
+
+	return sk->sk_prot->sendmsg(iocb, sk, m, total_len);
+}
+
+const struct proto_ops phonet_dgram_ops = {
+	.family		= AF_PHONET,
+	.owner		= THIS_MODULE,
+	.release	= pn_socket_release,
+	.bind		= pn_socket_bind,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= pn_socket_getname,
+	.poll		= datagram_poll,
+	.ioctl		= pn_socket_ioctl,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = sock_no_setsockopt,
+	.compat_getsockopt = sock_no_getsockopt,
+#endif
+	.sendmsg	= pn_socket_sendmsg,
+	.recvmsg	= sock_common_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+static DEFINE_MUTEX(port_mutex);
+
+/* allocate port for a socket */
+int pn_sock_get_port(struct sock *sk, unsigned short sport)
+{
+	static int port_cur;
+	struct pn_sock *pn = pn_sk(sk);
+	struct sockaddr_pn try_sa;
+	struct sock *tmpsk;
+
+	memset(&try_sa, 0, sizeof(struct sockaddr_pn));
+	try_sa.spn_family = AF_PHONET;
+
+	mutex_lock(&port_mutex);
+
+	if (!sport) {
+		/* search free port */
+		int port, pmin = 0x40, pmax = 0x7f;
+
+		for (port = pmin; port <= pmax; port++) {
+			port_cur++;
+			if (port_cur < pmin || port_cur > pmax)
+				port_cur = pmin;
+
+			pn_sockaddr_set_port(&try_sa, port_cur);
+			tmpsk = pn_find_sock_by_sa(&try_sa);
+			if (tmpsk == NULL) {
+				sport = port_cur;
+				goto found;
+			} else
+				sock_put(tmpsk);
+		}
+	} else {
+		/* try to find specific port */
+		pn_sockaddr_set_port(&try_sa, sport);
+		tmpsk = pn_find_sock_by_sa(&try_sa);
+		if (tmpsk == NULL)
+			/* No sock there! We can use that port... */
+			goto found;
+		else
+			sock_put(tmpsk);
+	}
+	mutex_unlock(&port_mutex);
+
+	/* the port must be in use already */
+	return -EADDRINUSE;
+
+found:
+	mutex_unlock(&port_mutex);
+	pn->sobject = pn_object(pn_addr(pn->sobject), sport);
+	return 0;
+}
+EXPORT_SYMBOL(pn_sock_get_port);
-- 
cgit v1.2.3


From 107d0d9b8d9a236883db72841fb61cedd5be845e Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:05:57 -0700
Subject: Phonet: Phonet datagram transport protocol

This provides the basic SOCK_DGRAM transport protocol for Phonet.

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile    |   1 +
 net/phonet/af_phonet.c | 106 ++++++++++++++++++++++++++
 net/phonet/datagram.c  | 197 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 304 insertions(+)
 create mode 100644 net/phonet/datagram.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index c1d671de783..d218abc3f06 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -4,4 +4,5 @@ phonet-objs := \
 	pn_dev.o \
 	pn_netlink.o \
 	socket.o \
+	datagram.o \
 	af_phonet.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index ba54d53020f..e6771d3961c 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -99,6 +99,101 @@ static struct net_proto_family phonet_proto_family = {
 	.owner = THIS_MODULE,
 };
 
+/*
+ * Prepends an ISI header and sends a datagram.
+ */
+static int pn_send(struct sk_buff *skb, struct net_device *dev,
+			u16 dst, u16 src, u8 res)
+{
+	struct phonethdr *ph;
+	int err;
+
+	if (skb->len + 2 > 0xffff) {
+		/* Phonet length field would overflow */
+		err = -EMSGSIZE;
+		goto drop;
+	}
+
+	skb_reset_transport_header(skb);
+	WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */
+	skb_push(skb, sizeof(struct phonethdr));
+	skb_reset_network_header(skb);
+	ph = pn_hdr(skb);
+	ph->pn_rdev = pn_dev(dst);
+	ph->pn_sdev = pn_dev(src);
+	ph->pn_res = res;
+	ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph));
+	ph->pn_robj = pn_obj(dst);
+	ph->pn_sobj = pn_obj(src);
+
+	skb->protocol = htons(ETH_P_PHONET);
+	skb->priority = 0;
+	skb->dev = dev;
+
+	if (pn_addr(src) == pn_addr(dst)) {
+		skb_reset_mac_header(skb);
+		skb->pkt_type = PACKET_LOOPBACK;
+		skb_orphan(skb);
+		netif_rx_ni(skb);
+		err = 0;
+	} else {
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+					NULL, NULL, skb->len);
+		if (err < 0) {
+			err = -EHOSTUNREACH;
+			goto drop;
+		}
+		err = dev_queue_xmit(skb);
+	}
+
+	return err;
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Create a Phonet header for the skb and send it out. Returns
+ * non-zero error code if failed. The skb is freed then.
+ */
+int pn_skb_send(struct sock *sk, struct sk_buff *skb,
+		const struct sockaddr_pn *target)
+{
+	struct net_device *dev;
+	struct pn_sock *pn = pn_sk(sk);
+	int err;
+	u16 src;
+	u8 daddr = pn_sockaddr_get_addr(target), saddr = PN_NO_ADDR;
+
+	err = -EHOSTUNREACH;
+	if (sk->sk_bound_dev_if)
+		dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if);
+	else
+		dev = phonet_device_get(sock_net(sk));
+	if (!dev || !(dev->flags & IFF_UP))
+		goto drop;
+
+	saddr = phonet_address_get(dev, daddr);
+	if (saddr == PN_NO_ADDR)
+		goto drop;
+
+	src = pn->sobject;
+	if (!pn_addr(src))
+		src = pn_object(saddr, pn_obj(src));
+
+	err = pn_send(skb, dev, pn_sockaddr_get_object(target),
+			src, pn_sockaddr_get_resource(target));
+	dev_put(dev);
+	return err;
+
+drop:
+	kfree_skb(skb);
+	if (dev)
+		dev_put(dev);
+	return err;
+}
+EXPORT_SYMBOL(pn_skb_send);
+
 /* packet type functions */
 
 /*
@@ -226,11 +321,22 @@ static int __init phonet_init(void)
 	phonet_device_init();
 	dev_add_pack(&phonet_packet_type);
 	phonet_netlink_register();
+
+	err = isi_register();
+	if (err)
+		goto err;
 	return 0;
+
+err:
+	sock_unregister(AF_PHONET);
+	dev_remove_pack(&phonet_packet_type);
+	phonet_device_exit();
+	return err;
 }
 
 static void __exit phonet_exit(void)
 {
+	isi_unregister();
 	sock_unregister(AF_PHONET);
 	dev_remove_pack(&phonet_packet_type);
 	phonet_device_exit();
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
new file mode 100644
index 00000000000..e087862ed7e
--- /dev/null
+++ b/net/phonet/datagram.c
@@ -0,0 +1,197 @@
+/*
+ * File: datagram.c
+ *
+ * Datagram (ISI) Phonet sockets
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <asm/ioctls.h>
+#include <net/sock.h>
+
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+
+static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
+/* associated socket ceases to exist */
+static void pn_sock_close(struct sock *sk, long timeout)
+{
+	sk_common_release(sk);
+}
+
+static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct sk_buff *skb;
+	int answ;
+
+	switch (cmd) {
+	case SIOCINQ:
+		lock_sock(sk);
+		skb = skb_peek(&sk->sk_receive_queue);
+		answ = skb ? skb->len : 0;
+		release_sock(sk);
+		return put_user(answ, (int __user *)arg);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/* Destroy socket. All references are gone. */
+static void pn_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
+static int pn_init(struct sock *sk)
+{
+	sk->sk_destruct = pn_destruct;
+	return 0;
+}
+
+static int pn_sendmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len)
+{
+	struct sockaddr_pn *target;
+	struct sk_buff *skb;
+	int err;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	if (msg->msg_name == NULL)
+		return -EDESTADDRREQ;
+
+	if (msg->msg_namelen < sizeof(struct sockaddr_pn))
+		return -EINVAL;
+
+	target = (struct sockaddr_pn *)msg->msg_name;
+	if (target->spn_family != AF_PHONET)
+		return -EAFNOSUPPORT;
+
+	skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len,
+					msg->msg_flags & MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		return err;
+	skb_reserve(skb, MAX_PHONET_HEADER);
+
+	err = memcpy_fromiovec((void *)skb_put(skb, len), msg->msg_iov, len);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	/*
+	 * Fill in the Phonet header and
+	 * finally pass the packet forwards.
+	 */
+	err = pn_skb_send(sk, skb, target);
+
+	/* If ok, return len. */
+	return (err >= 0) ? len : err;
+}
+
+static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len, int noblock,
+			int flags, int *addr_len)
+{
+	struct sk_buff *skb = NULL;
+	struct sockaddr_pn sa;
+	int rval = -EOPNOTSUPP;
+	int copylen;
+
+	if (flags & MSG_OOB)
+		goto out_nofree;
+
+	if (addr_len)
+		*addr_len = sizeof(sa);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &rval);
+	if (skb == NULL)
+		goto out_nofree;
+
+	pn_skb_get_src_sockaddr(skb, &sa);
+
+	copylen = skb->len;
+	if (len < copylen) {
+		msg->msg_flags |= MSG_TRUNC;
+		copylen = len;
+	}
+
+	rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen);
+	if (rval) {
+		rval = -EFAULT;
+		goto out;
+	}
+
+	rval = (flags & MSG_TRUNC) ? skb->len : copylen;
+
+	if (msg->msg_name != NULL)
+		memcpy(msg->msg_name, &sa, sizeof(struct sockaddr_pn));
+
+out:
+	skb_free_datagram(sk, skb);
+
+out_nofree:
+	return rval;
+}
+
+/* Queue an skb for a sock. */
+static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int err = sock_queue_rcv_skb(sk, skb);
+	if (err < 0)
+		kfree_skb(skb);
+	return err ? NET_RX_DROP : NET_RX_SUCCESS;
+}
+
+/* Module registration */
+static struct proto pn_proto = {
+	.close		= pn_sock_close,
+	.ioctl		= pn_ioctl,
+	.init		= pn_init,
+	.sendmsg	= pn_sendmsg,
+	.recvmsg	= pn_recvmsg,
+	.backlog_rcv	= pn_backlog_rcv,
+	.hash		= pn_sock_hash,
+	.unhash		= pn_sock_unhash,
+	.get_port	= pn_sock_get_port,
+	.obj_size	= sizeof(struct pn_sock),
+	.owner		= THIS_MODULE,
+	.name		= "PHONET",
+};
+
+static struct phonet_protocol pn_dgram_proto = {
+	.ops		= &phonet_dgram_ops,
+	.prot		= &pn_proto,
+	.sock_type	= SOCK_DGRAM,
+};
+
+int __init isi_register(void)
+{
+	return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto);
+}
+
+void __exit isi_unregister(void)
+{
+	phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto);
+}
-- 
cgit v1.2.3


From 5f77076d75d35c9f5619e1f9d7e7428a627f65e6 Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:08:04 -0700
Subject: Phonet: provide MAC header operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/af_phonet.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'net')

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index e6771d3961c..51397ff308b 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -99,6 +99,35 @@ static struct net_proto_family phonet_proto_family = {
 	.owner = THIS_MODULE,
 };
 
+/* Phonet device header operations */
+static int pn_header_create(struct sk_buff *skb, struct net_device *dev,
+				unsigned short type, const void *daddr,
+				const void *saddr, unsigned len)
+{
+	u8 *media = skb_push(skb, 1);
+
+	if (type != ETH_P_PHONET)
+		return -1;
+
+	if (!saddr)
+		saddr = dev->dev_addr;
+	*media = *(const u8 *)saddr;
+	return 1;
+}
+
+static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+	const u8 *media = skb_mac_header(skb);
+	*haddr = *media;
+	return 1;
+}
+
+struct header_ops phonet_header_ops = {
+	.create = pn_header_create,
+	.parse = pn_header_parse,
+};
+EXPORT_SYMBOL(phonet_header_ops);
+
 /*
  * Prepends an ISI header and sends a datagram.
  */
-- 
cgit v1.2.3


From 87ab4e20b445c6d2d2727ab4f96fa17f7259511e Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:08:39 -0700
Subject: Phonet: proc interface for port range

Phonet endpoints are bound to individual ports.
This provides a /proc/sys/net/phonet (or sysctl) interface for
selecting the range of automatically allocated ports (much like the
ip_local_port_range with IPv4).

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile    |   1 +
 net/phonet/af_phonet.c |   3 ++
 net/phonet/socket.c    |   3 +-
 net/phonet/sysctl.c    | 113 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 net/phonet/sysctl.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index d218abc3f06..ae9c3ed5be8 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -5,4 +5,5 @@ phonet-objs := \
 	pn_netlink.o \
 	socket.o \
 	datagram.o \
+	sysctl.o \
 	af_phonet.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 51397ff308b..50dc258d5aa 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -350,6 +350,7 @@ static int __init phonet_init(void)
 	phonet_device_init();
 	dev_add_pack(&phonet_packet_type);
 	phonet_netlink_register();
+	phonet_sysctl_init();
 
 	err = isi_register();
 	if (err)
@@ -357,6 +358,7 @@ static int __init phonet_init(void)
 	return 0;
 
 err:
+	phonet_sysctl_exit();
 	sock_unregister(AF_PHONET);
 	dev_remove_pack(&phonet_packet_type);
 	phonet_device_exit();
@@ -366,6 +368,7 @@ err:
 static void __exit phonet_exit(void)
 {
 	isi_unregister();
+	phonet_sysctl_exit();
 	sock_unregister(AF_PHONET);
 	dev_remove_pack(&phonet_packet_type);
 	phonet_device_exit();
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 99a4945d565..dfd4061646d 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -273,8 +273,9 @@ int pn_sock_get_port(struct sock *sk, unsigned short sport)
 
 	if (!sport) {
 		/* search free port */
-		int port, pmin = 0x40, pmax = 0x7f;
+		int port, pmin, pmax;
 
+		phonet_get_local_port_range(&pmin, &pmax);
 		for (port = pmin; port <= pmax; port++) {
 			port_cur++;
 			if (port_cur < pmin || port_cur > pmax)
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
new file mode 100644
index 00000000000..600a4309b8c
--- /dev/null
+++ b/net/phonet/sysctl.c
@@ -0,0 +1,113 @@
+/*
+ * File: sysctl.c
+ *
+ * Phonet /proc/sys/net/phonet interface implementation
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/seqlock.h>
+#include <linux/sysctl.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#define DYNAMIC_PORT_MIN	0x40
+#define DYNAMIC_PORT_MAX	0x7f
+
+static DEFINE_SEQLOCK(local_port_range_lock);
+static int local_port_range_min[2] = {0, 0};
+static int local_port_range_max[2] = {1023, 1023};
+static int local_port_range[2] = {DYNAMIC_PORT_MIN, DYNAMIC_PORT_MAX};
+static struct ctl_table_header *phonet_table_hrd;
+
+static void set_local_port_range(int range[2])
+{
+	write_seqlock(&local_port_range_lock);
+	local_port_range[0] = range[0];
+	local_port_range[1] = range[1];
+	write_sequnlock(&local_port_range_lock);
+}
+
+void phonet_get_local_port_range(int *min, int *max)
+{
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&local_port_range_lock);
+		if (min)
+			*min = local_port_range[0];
+		if (max)
+			*max = local_port_range[1];
+	} while (read_seqretry(&local_port_range_lock, seq));
+}
+
+static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	int range[2] = {local_port_range[0], local_port_range[1]};
+	ctl_table tmp = {
+		.data = &range,
+		.maxlen = sizeof(range),
+		.mode = table->mode,
+		.extra1 = &local_port_range_min,
+		.extra2 = &local_port_range_max,
+	};
+
+	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		if (range[1] < range[0])
+			ret = -EINVAL;
+		else
+			set_local_port_range(range);
+	}
+
+	return ret;
+}
+
+static struct ctl_table phonet_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "local_port_range",
+		.data		= &local_port_range,
+		.maxlen		= sizeof(local_port_range),
+		.mode		= 0644,
+		.proc_handler	= &proc_local_port_range,
+		.strategy	= NULL,
+	},
+	{ .ctl_name = 0 }
+};
+
+struct ctl_path phonet_ctl_path[] = {
+	{ .procname = "net", .ctl_name = CTL_NET, },
+	{ .procname = "phonet", .ctl_name = CTL_UNNUMBERED, },
+	{ },
+};
+
+int __init phonet_sysctl_init(void)
+{
+	phonet_table_hrd = register_sysctl_paths(phonet_ctl_path, phonet_table);
+	return phonet_table_hrd == NULL ? -ENOMEM : 0;
+}
+
+void phonet_sysctl_exit(void)
+{
+	unregister_sysctl_table(phonet_table_hrd);
+}
-- 
cgit v1.2.3


From be0c52bfed7f7828494fa00060efd5d758e92580 Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Mon, 22 Sep 2008 20:09:13 -0700
Subject: Phonet: emit errors when a packet cannot be delivered locally

When there is no listener socket for a received packet, send an error
back to the sender.

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/af_phonet.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 50dc258d5aa..1d8df6b7e3d 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -132,7 +132,7 @@ EXPORT_SYMBOL(phonet_header_ops);
  * Prepends an ISI header and sends a datagram.
  */
 static int pn_send(struct sk_buff *skb, struct net_device *dev,
-			u16 dst, u16 src, u8 res)
+			u16 dst, u16 src, u8 res, u8 irq)
 {
 	struct phonethdr *ph;
 	int err;
@@ -163,7 +163,10 @@ static int pn_send(struct sk_buff *skb, struct net_device *dev,
 		skb_reset_mac_header(skb);
 		skb->pkt_type = PACKET_LOOPBACK;
 		skb_orphan(skb);
-		netif_rx_ni(skb);
+		if (irq)
+			netif_rx(skb);
+		else
+			netif_rx_ni(skb);
 		err = 0;
 	} else {
 		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
@@ -181,6 +184,19 @@ drop:
 	return err;
 }
 
+static int pn_raw_send(const void *data, int len, struct net_device *dev,
+			u16 dst, u16 src, u8 res)
+{
+	struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	skb_reserve(skb, MAX_PHONET_HEADER);
+	__skb_put(skb, len);
+	skb_copy_to_linear_data(skb, data, len);
+	return pn_send(skb, dev, dst, src, res, 1);
+}
+
 /*
  * Create a Phonet header for the skb and send it out. Returns
  * non-zero error code if failed. The skb is freed then.
@@ -211,7 +227,7 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
 		src = pn_object(saddr, pn_obj(src));
 
 	err = pn_send(skb, dev, pn_sockaddr_get_object(target),
-			src, pn_sockaddr_get_resource(target));
+			src, pn_sockaddr_get_resource(target), 0);
 	dev_put(dev);
 	return err;
 
@@ -223,6 +239,73 @@ drop:
 }
 EXPORT_SYMBOL(pn_skb_send);
 
+/* Do not send an error message in response to an error message */
+static inline int can_respond(struct sk_buff *skb)
+{
+	const struct phonethdr *ph;
+	const struct phonetmsg *pm;
+	u8 submsg_id;
+
+	if (!pskb_may_pull(skb, 3))
+		return 0;
+
+	ph = pn_hdr(skb);
+	if (phonet_address_get(skb->dev, ph->pn_rdev) != ph->pn_rdev)
+		return 0; /* we are not the destination */
+	if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5))
+		return 0;
+
+	ph = pn_hdr(skb); /* re-acquires the pointer */
+	pm = pn_msg(skb);
+	if (pm->pn_msg_id != PN_COMMON_MESSAGE)
+		return 1;
+	submsg_id = (ph->pn_res == PN_PREFIX)
+		? pm->pn_e_submsg_id : pm->pn_submsg_id;
+	if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP &&
+		pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP)
+		return 1;
+	return 0;
+}
+
+static int send_obj_unreachable(struct sk_buff *rskb)
+{
+	const struct phonethdr *oph = pn_hdr(rskb);
+	const struct phonetmsg *opm = pn_msg(rskb);
+	struct phonetmsg resp;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.pn_trans_id = opm->pn_trans_id;
+	resp.pn_msg_id = PN_COMMON_MESSAGE;
+	if (oph->pn_res == PN_PREFIX) {
+		resp.pn_e_res_id = opm->pn_e_res_id;
+		resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
+		resp.pn_e_orig_msg_id = opm->pn_msg_id;
+		resp.pn_e_status = 0;
+	} else {
+		resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
+		resp.pn_orig_msg_id = opm->pn_msg_id;
+		resp.pn_status = 0;
+	}
+	return pn_raw_send(&resp, sizeof(resp), rskb->dev,
+				pn_object(oph->pn_sdev, oph->pn_sobj),
+				pn_object(oph->pn_rdev, oph->pn_robj),
+				oph->pn_res);
+}
+
+static int send_reset_indications(struct sk_buff *rskb)
+{
+	struct phonethdr *oph = pn_hdr(rskb);
+	static const u8 data[4] = {
+		0x00 /* trans ID */, 0x10 /* subscribe msg */,
+		0x00 /* subscription count */, 0x00 /* dummy */
+	};
+
+	return pn_raw_send(data, sizeof(data), rskb->dev,
+				pn_object(oph->pn_sdev, 0x00),
+				pn_object(oph->pn_rdev, oph->pn_robj), 0x10);
+}
+
+
 /* packet type functions */
 
 /*
@@ -260,8 +343,13 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto out; /* currently, we cannot be device 0 */
 
 	sk = pn_find_sock_by_sa(&sa);
-	if (sk == NULL)
+	if (sk == NULL) {
+		if (can_respond(skb)) {
+			send_obj_unreachable(skb);
+			send_reset_indications(skb);
+		}
 		goto out;
+	}
 
 	/* Push data to the socket (or other sockets connected to it). */
 	return sk_receive_skb(sk, skb, 0);
-- 
cgit v1.2.3


From 0b815a1a6d43ab498674b8430c8c35ab08487a16 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 22 Sep 2008 21:28:11 -0700
Subject: net: network device name ifalias support

This patch add support for keeping an additional character alias
associated with an network interface. This is useful for maintaining
the SNMP ifAlias value which is a user defined value. Routers use this
to hold information like which circuit or line it is connected to. It
is just an arbitrary text label on the network device.

There are two exposed interfaces with this patch, the value can be
read/written either via netlink or sysfs.

This could be maintained just by the snmp daemon, but it is more
generally useful for other management tools, and the kernel is good
place to act as an agreed upon interface to store it.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 23 +++++++++++++++++++++++
 net/core/net-sysfs.c | 36 ++++++++++++++++++++++++++++++++++++
 net/core/rtnetlink.c | 13 +++++++++++++
 3 files changed, 72 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index fdfc4b6a644..e9139053399 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -953,6 +953,29 @@ rollback:
 	return err;
 }
 
+/**
+ *	dev_set_alias - change ifalias of a device
+ *	@dev: device
+ *	@alias: name up to IFALIASZ
+ *
+ *	Set ifalias for a device,
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+	ASSERT_RTNL();
+
+	if (len >= IFALIASZ)
+		return -EINVAL;
+
+	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
+	if (!dev->ifalias)
+		return -ENOMEM;
+
+	strlcpy(dev->ifalias, alias, len+1);
+	return len;
+}
+
+
 /**
  *	netdev_features_change - device changes features
  *	@dev: device to cause notification
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c1f4e0d428c..92d6b946731 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -209,9 +209,44 @@ static ssize_t store_tx_queue_len(struct device *dev,
 	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
 }
 
+static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t len)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	size_t count = len;
+	ssize_t ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* ignore trailing newline */
+	if (len >  0 && buf[len - 1] == '\n')
+		--count;
+
+	rtnl_lock();
+	ret = dev_set_alias(netdev, buf, count);
+	rtnl_unlock();
+
+	return ret < 0 ? ret : len;
+}
+
+static ssize_t show_ifalias(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	const struct net_device *netdev = to_net_dev(dev);
+	ssize_t ret = 0;
+
+	rtnl_lock();
+	if (netdev->ifalias)
+		ret = sprintf(buf, "%s\n", netdev->ifalias);
+	rtnl_unlock();
+	return ret;
+}
+
 static struct device_attribute net_class_attributes[] = {
 	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
 	__ATTR(dev_id, S_IRUGO, show_dev_id, NULL),
+	__ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias),
 	__ATTR(iflink, S_IRUGO, show_iflink, NULL),
 	__ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
 	__ATTR(features, S_IRUGO, show_features, NULL),
@@ -418,6 +453,7 @@ static void netdev_release(struct device *d)
 
 	BUG_ON(dev->reg_state != NETREG_RELEASED);
 
+	kfree(dev->ifalias);
 	kfree((char *)dev - dev->padded);
 }
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 71edb8b3634..8862498fd4a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -586,6 +586,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 {
 	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
 	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+	       + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
 	       + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
 	       + nla_total_size(sizeof(struct rtnl_link_ifmap))
 	       + nla_total_size(sizeof(struct rtnl_link_stats))
@@ -640,6 +641,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (txq->qdisc_sleeping)
 		NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id);
 
+	if (dev->ifalias)
+		NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias);
+
 	if (1) {
 		struct rtnl_link_ifmap map = {
 			.mem_start   = dev->mem_start,
@@ -713,6 +717,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKMODE]		= { .type = NLA_U8 },
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
+	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -853,6 +858,14 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		modified = 1;
 	}
 
+	if (tb[IFLA_IFALIAS]) {
+		err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+				    nla_len(tb[IFLA_IFALIAS]));
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
 	if (tb[IFLA_BROADCAST]) {
 		nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
 		send_addr_notify = 1;
-- 
cgit v1.2.3


From 242f8bfefe4bed626df4e4727ac8f315d80b567a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 22 Sep 2008 22:15:30 -0700
Subject: pkt_sched: Make qdisc->gso_skb a list.

The idea is that we can use this to get rid of
->requeue().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ec0a0839ce5..5961536be60 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,7 +45,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
 	if (unlikely(skb->next))
-		q->gso_skb = skb;
+		__skb_queue_head(&q->requeue, skb);
 	else
 		q->ops->requeue(skb, q);
 
@@ -57,9 +57,8 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
 	struct sk_buff *skb;
 
-	if ((skb = q->gso_skb))
-		q->gso_skb = NULL;
-	else
+	skb = __skb_dequeue(&q->requeue);
+	if (!skb)
 		skb = q->dequeue(q);
 
 	return skb;
@@ -327,6 +326,7 @@ struct Qdisc noop_qdisc = {
 	.flags		=	TCQ_F_BUILTIN,
 	.ops		=	&noop_qdisc_ops,
 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
+	.requeue.lock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
 };
@@ -352,6 +352,7 @@ static struct Qdisc noqueue_qdisc = {
 	.flags		=	TCQ_F_BUILTIN,
 	.ops		=	&noqueue_qdisc_ops,
 	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
+	.requeue.lock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
 	.dev_queue	=	&noqueue_netdev_queue,
 };
@@ -472,6 +473,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	sch->padded = (char *) sch - (char *) p;
 
 	INIT_LIST_HEAD(&sch->list);
+	skb_queue_head_init(&sch->requeue);
 	skb_queue_head_init(&sch->q);
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
@@ -539,7 +541,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
 
-	kfree_skb(qdisc->gso_skb);
+	__skb_queue_purge(&qdisc->requeue);
 
 	kfree((char *) qdisc - qdisc->padded);
 }
-- 
cgit v1.2.3


From f0876520b0b721bedafd9cec3b1b0624ae566eee Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 22 Sep 2008 22:15:58 -0700
Subject: pkt_sched: Always use q->requeue in dev_requeue_skb().

There is no reason to call into the complicated qdiscs
just to remember the last SKB where we found the device
blocked.

The SKB is outside of the qdiscs realm at this point.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5961536be60..1b508bd1c06 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,10 +44,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
 
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-	if (unlikely(skb->next))
-		__skb_queue_head(&q->requeue, skb);
-	else
-		q->ops->requeue(skb, q);
+	__skb_queue_head(&q->requeue, skb);
 
 	__netif_schedule(q);
 	return 0;
-- 
cgit v1.2.3


From ebf059821ed8a36acd706484b719d14d212ada32 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 22 Sep 2008 22:16:23 -0700
Subject: pkt_sched: Check the state of tx_queue in dequeue_skb()

Check in dequeue_skb() the state of tx_queue for requeued skb to save
on locking and re-requeuing, and possibly remove the current check in
qdisc_run(). Based on the idea of Alexander Duyck.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1b508bd1c06..5e7e0bd38fe 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -52,11 +52,21 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 
 static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
-	struct sk_buff *skb;
-
-	skb = __skb_dequeue(&q->requeue);
-	if (!skb)
+	struct sk_buff *skb = skb_peek(&q->requeue);
+
+	if (unlikely(skb)) {
+		struct net_device *dev = qdisc_dev(q);
+		struct netdev_queue *txq;
+
+		/* check the reason of requeuing without tx lock first */
+		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
+			__skb_unlink(skb, &q->requeue);
+		else
+			skb = NULL;
+	} else {
 		skb = q->dequeue(q);
+	}
 
 	return skb;
 }
-- 
cgit v1.2.3


From f72051b0674f36c960698653a0583edaec1e495e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 23 Sep 2008 01:11:18 -0700
Subject: neigh: Remove by-hand SKB queue handling.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 9d92e41826e..1dc728b3858 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -927,8 +927,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 			if (skb_queue_len(&neigh->arp_queue) >=
 			    neigh->parms->queue_len) {
 				struct sk_buff *buff;
-				buff = neigh->arp_queue.next;
-				__skb_unlink(buff, &neigh->arp_queue);
+				buff = __skb_dequeue(&neigh->arp_queue);
 				kfree_skb(buff);
 				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
 			}
@@ -1259,24 +1258,20 @@ static void neigh_proxy_process(unsigned long arg)
 	struct neigh_table *tbl = (struct neigh_table *)arg;
 	long sched_next = 0;
 	unsigned long now = jiffies;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *n;
 
 	spin_lock(&tbl->proxy_queue.lock);
 
-	skb = tbl->proxy_queue.next;
-
-	while (skb != (struct sk_buff *)&tbl->proxy_queue) {
-		struct sk_buff *back = skb;
-		long tdif = NEIGH_CB(back)->sched_next - now;
+	skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+		long tdif = NEIGH_CB(skb)->sched_next - now;
 
-		skb = skb->next;
 		if (tdif <= 0) {
-			struct net_device *dev = back->dev;
-			__skb_unlink(back, &tbl->proxy_queue);
+			struct net_device *dev = skb->dev;
+			__skb_unlink(skb, &tbl->proxy_queue);
 			if (tbl->proxy_redo && netif_running(dev))
-				tbl->proxy_redo(back);
+				tbl->proxy_redo(skb);
 			else
-				kfree_skb(back);
+				kfree_skb(skb);
 
 			dev_put(dev);
 		} else if (!sched_next || tdif < sched_next)
-- 
cgit v1.2.3


From 77d40a0952b16e020ce07c4cf9fb22024448275b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 23 Sep 2008 01:29:23 -0700
Subject: tcp: Fix order of tests in tcp_retransmit_skb()

tcp_write_queue_next() must only be made if we know that
tcp_skb_is_last() evaluates to false.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c3d58ee3e16..a8499ef3234 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1932,8 +1932,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	/* Collapse two adjacent packets if worthwhile and we can. */
 	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
 	    (skb->len < (cur_mss >> 1)) &&
-	    (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
 	    (!tcp_skb_is_last(sk, skb)) &&
+	    (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
 	    (skb_shinfo(skb)->nr_frags == 0 &&
 	     skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
 	    (tcp_skb_pcount(skb) == 1 &&
-- 
cgit v1.2.3


From 28e3487b7dd8a9791baac924bc887140ec747bed Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 23 Sep 2008 02:51:41 -0700
Subject: tcp: Fix queue traversal in tcp_use_frto().

We must check tcp_skb_is_last() before doing a tcp_write_queue_next().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cbfe13d5f42..3b76bce769d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1746,6 +1746,8 @@ int tcp_use_frto(struct sock *sk)
 		return 0;
 
 	skb = tcp_write_queue_head(sk);
+	if (tcp_skb_is_last(sk, skb))
+		return 1;
 	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
 	tcp_for_write_queue_from(skb, sk) {
 		if (skb == tcp_send_head(sk))
-- 
cgit v1.2.3


From 96ca4a2cc1454cf633a1e0796b7ef39d937b87ec Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <oliver@hartkopp.net>
Date: Tue, 23 Sep 2008 21:23:19 -0700
Subject: net: remove ifalias on empty given alias

This patch removes the potentially allocated ifalias when the (new) given alias is empty.

E.g. when setting

echo "" > /sys/class/net/eth0/ifalias

Signed-off-by: Oliver Hartkopp <oliver@hartkopp.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index e9139053399..a90737fe247 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -967,6 +967,14 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 	if (len >= IFALIASZ)
 		return -EINVAL;
 
+	if (!len) {
+		if (dev->ifalias) {
+			kfree(dev->ifalias);
+			dev->ifalias = NULL;
+		}
+		return 0;
+	}
+
 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
 	if (!dev->ifalias)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 762af43bda3d8281a2738d3920ae5ded170aaf39 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 15 Sep 2008 10:30:34 +0200
Subject: cfg80211: fix static regdomains

When Luis added the static regdomains back he used +/-20
of the centre frequencies to account for 40MHz bandwidth
neglecting the fact that 40MHz bandwidth cannot be used
on the channels close to the allowed band edges.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/core.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index a910cd2d0fd..59e4d7debf0 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -41,7 +41,7 @@ const struct ieee80211_regdomain world_regdom = {
 	.n_reg_rules = 1,
 	.alpha2 =  "00",
 	.reg_rules = {
-		REG_RULE(2402, 2472, 40, 6, 20,
+		REG_RULE(2412-10, 2462+10, 40, 6, 20,
 			NL80211_RRF_PASSIVE_SCAN |
 			NL80211_RRF_NO_IBSS),
 	}
@@ -64,17 +64,17 @@ const struct ieee80211_regdomain us_regdom = {
 	.alpha2 =  "US",
 	.reg_rules = {
 		/* IEEE 802.11b/g, channels 1..11 */
-		REG_RULE(2412-20, 2462+20, 40, 6, 27, 0),
+		REG_RULE(2412-10, 2462+10, 40, 6, 27, 0),
 		/* IEEE 802.11a, channel 36 */
-		REG_RULE(5180-20, 5180+20, 40, 6, 23, 0),
+		REG_RULE(5180-10, 5180+10, 40, 6, 23, 0),
 		/* IEEE 802.11a, channel 40 */
-		REG_RULE(5200-20, 5200+20, 40, 6, 23, 0),
+		REG_RULE(5200-10, 5200+10, 40, 6, 23, 0),
 		/* IEEE 802.11a, channel 44 */
-		REG_RULE(5220-20, 5220+20, 40, 6, 23, 0),
+		REG_RULE(5220-10, 5220+10, 40, 6, 23, 0),
 		/* IEEE 802.11a, channels 48..64 */
-		REG_RULE(5240-20, 5320+20, 40, 6, 23, 0),
+		REG_RULE(5240-10, 5320+10, 40, 6, 23, 0),
 		/* IEEE 802.11a, channels 149..165, outdoor */
-		REG_RULE(5745-20, 5825+20, 40, 6, 30, 0),
+		REG_RULE(5745-10, 5825+10, 40, 6, 30, 0),
 	}
 };
 
@@ -83,12 +83,12 @@ const struct ieee80211_regdomain jp_regdom = {
 	.alpha2 =  "JP",
 	.reg_rules = {
 		/* IEEE 802.11b/g, channels 1..14 */
-		REG_RULE(2412-20, 2484+20, 40, 6, 20, 0),
+		REG_RULE(2412-10, 2484+10, 40, 6, 20, 0),
 		/* IEEE 802.11a, channels 34..48 */
-		REG_RULE(5170-20, 5240+20, 40, 6, 20,
+		REG_RULE(5170-10, 5240+10, 40, 6, 20,
 			NL80211_RRF_PASSIVE_SCAN),
 		/* IEEE 802.11a, channels 52..64 */
-		REG_RULE(5260-20, 5320+20, 40, 6, 20,
+		REG_RULE(5260-10, 5320+10, 40, 6, 20,
 			NL80211_RRF_NO_IBSS |
 			NL80211_RRF_DFS),
 	}
@@ -101,22 +101,22 @@ const struct ieee80211_regdomain eu_regdom = {
 	.alpha2 =  "EU",
 	.reg_rules = {
 		/* IEEE 802.11b/g, channels 1..13 */
-		REG_RULE(2412-20, 2472+20, 40, 6, 20, 0),
+		REG_RULE(2412-10, 2472+10, 40, 6, 20, 0),
 		/* IEEE 802.11a, channel 36 */
-		REG_RULE(5180-20, 5180+20, 40, 6, 23,
+		REG_RULE(5180-10, 5180+10, 40, 6, 23,
 			NL80211_RRF_PASSIVE_SCAN),
 		/* IEEE 802.11a, channel 40 */
-		REG_RULE(5200-20, 5200+20, 40, 6, 23,
+		REG_RULE(5200-10, 5200+10, 40, 6, 23,
 			NL80211_RRF_PASSIVE_SCAN),
 		/* IEEE 802.11a, channel 44 */
-		REG_RULE(5220-20, 5220+20, 40, 6, 23,
+		REG_RULE(5220-10, 5220+10, 40, 6, 23,
 			NL80211_RRF_PASSIVE_SCAN),
 		/* IEEE 802.11a, channels 48..64 */
-		REG_RULE(5240-20, 5320+20, 40, 6, 20,
+		REG_RULE(5240-10, 5320+10, 40, 6, 20,
 			NL80211_RRF_NO_IBSS |
 			NL80211_RRF_DFS),
 		/* IEEE 802.11a, channels 100..140 */
-		REG_RULE(5500-20, 5700+20, 40, 6, 30,
+		REG_RULE(5500-10, 5700+10, 40, 6, 30,
 			NL80211_RRF_NO_IBSS |
 			NL80211_RRF_DFS),
 	}
-- 
cgit v1.2.3


From 734366deaee05b1a5842d977960b4cc574d7551d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 15 Sep 2008 10:56:48 +0200
Subject: cfg80211: clean up regulatory mess

The recent code from Luis is an #ifdef hell and contains lots of
code that's stuffed into the wrong file making a whole bunch of
things needlessly non-static, and besides, what is it doing in
core.c??

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/core.c | 144 -----------------------------
 net/wireless/reg.c  | 254 +++++++++++++++++++++++++++++++++++++++++-----------
 net/wireless/reg.h  |  35 +-------
 3 files changed, 203 insertions(+), 230 deletions(-)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index 59e4d7debf0..88cb7339486 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -29,107 +29,6 @@ MODULE_AUTHOR("Johannes Berg");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("wireless configuration support");
 
-struct list_head regulatory_requests;
-
-/* Central wireless core regulatory domains, we only need two,
- * the current one and a world regulatory domain in case we have no
- * information to give us an alpha2 */
-struct ieee80211_regdomain *cfg80211_regdomain;
-
-/* We keep a static world regulatory domain in case of the absence of CRDA */
-const struct ieee80211_regdomain world_regdom = {
-	.n_reg_rules = 1,
-	.alpha2 =  "00",
-	.reg_rules = {
-		REG_RULE(2412-10, 2462+10, 40, 6, 20,
-			NL80211_RRF_PASSIVE_SCAN |
-			NL80211_RRF_NO_IBSS),
-	}
-};
-
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-/* All this fucking static junk will be removed soon, so
- * don't fucking count on it !@#$ */
-
-static char *ieee80211_regdom = "US";
-module_param(ieee80211_regdom, charp, 0444);
-MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
-
-/* We assume 40 MHz bandwidth for the old regulatory work.
- * We make emphasis we are using the exact same frequencies
- * as before */
-
-const struct ieee80211_regdomain us_regdom = {
-	.n_reg_rules = 6,
-	.alpha2 =  "US",
-	.reg_rules = {
-		/* IEEE 802.11b/g, channels 1..11 */
-		REG_RULE(2412-10, 2462+10, 40, 6, 27, 0),
-		/* IEEE 802.11a, channel 36 */
-		REG_RULE(5180-10, 5180+10, 40, 6, 23, 0),
-		/* IEEE 802.11a, channel 40 */
-		REG_RULE(5200-10, 5200+10, 40, 6, 23, 0),
-		/* IEEE 802.11a, channel 44 */
-		REG_RULE(5220-10, 5220+10, 40, 6, 23, 0),
-		/* IEEE 802.11a, channels 48..64 */
-		REG_RULE(5240-10, 5320+10, 40, 6, 23, 0),
-		/* IEEE 802.11a, channels 149..165, outdoor */
-		REG_RULE(5745-10, 5825+10, 40, 6, 30, 0),
-	}
-};
-
-const struct ieee80211_regdomain jp_regdom = {
-	.n_reg_rules = 3,
-	.alpha2 =  "JP",
-	.reg_rules = {
-		/* IEEE 802.11b/g, channels 1..14 */
-		REG_RULE(2412-10, 2484+10, 40, 6, 20, 0),
-		/* IEEE 802.11a, channels 34..48 */
-		REG_RULE(5170-10, 5240+10, 40, 6, 20,
-			NL80211_RRF_PASSIVE_SCAN),
-		/* IEEE 802.11a, channels 52..64 */
-		REG_RULE(5260-10, 5320+10, 40, 6, 20,
-			NL80211_RRF_NO_IBSS |
-			NL80211_RRF_DFS),
-	}
-};
-
-const struct ieee80211_regdomain eu_regdom = {
-	.n_reg_rules = 6,
-	/* This alpha2 is bogus, we leave it here just for stupid
-	 * backward compatibility */
-	.alpha2 =  "EU",
-	.reg_rules = {
-		/* IEEE 802.11b/g, channels 1..13 */
-		REG_RULE(2412-10, 2472+10, 40, 6, 20, 0),
-		/* IEEE 802.11a, channel 36 */
-		REG_RULE(5180-10, 5180+10, 40, 6, 23,
-			NL80211_RRF_PASSIVE_SCAN),
-		/* IEEE 802.11a, channel 40 */
-		REG_RULE(5200-10, 5200+10, 40, 6, 23,
-			NL80211_RRF_PASSIVE_SCAN),
-		/* IEEE 802.11a, channel 44 */
-		REG_RULE(5220-10, 5220+10, 40, 6, 23,
-			NL80211_RRF_PASSIVE_SCAN),
-		/* IEEE 802.11a, channels 48..64 */
-		REG_RULE(5240-10, 5320+10, 40, 6, 20,
-			NL80211_RRF_NO_IBSS |
-			NL80211_RRF_DFS),
-		/* IEEE 802.11a, channels 100..140 */
-		REG_RULE(5500-10, 5700+10, 40, 6, 30,
-			NL80211_RRF_NO_IBSS |
-			NL80211_RRF_DFS),
-	}
-};
-
-#endif
-
-struct ieee80211_regdomain *cfg80211_world_regdom =
-	(struct ieee80211_regdomain *) &world_regdom;
-
-LIST_HEAD(regulatory_requests);
-DEFINE_MUTEX(cfg80211_reg_mutex);
-
 /* RCU might be appropriate here since we usually
  * only read the list, and that can happen quite
  * often because we need to do it for each command */
@@ -514,34 +413,10 @@ static struct notifier_block cfg80211_netdev_notifier = {
 	.notifier_call = cfg80211_netdev_notifier_call,
 };
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-const struct ieee80211_regdomain *static_regdom(char *alpha2)
-{
-	if (alpha2[0] == 'U' && alpha2[1] == 'S')
-		return &us_regdom;
-	if (alpha2[0] == 'J' && alpha2[1] == 'P')
-		return &jp_regdom;
-	if (alpha2[0] == 'E' && alpha2[1] == 'U')
-		return &eu_regdom;
-	/* Default, as per the old rules */
-	return &us_regdom;
-}
-#endif
-
 static int cfg80211_init(void)
 {
 	int err;
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	cfg80211_regdomain =
-		(struct ieee80211_regdomain *) static_regdom(ieee80211_regdom);
-	/* Used during reset_regdomains_static() */
-	cfg80211_world_regdom = cfg80211_regdomain;
-#else
-	cfg80211_regdomain =
-		(struct ieee80211_regdomain *) cfg80211_world_regdom;
-#endif
-
 	err = wiphy_sysfs_init();
 	if (err)
 		goto out_fail_sysfs;
@@ -560,25 +435,6 @@ static int cfg80211_init(void)
 	if (err)
 		goto out_fail_reg;
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	printk(KERN_INFO "cfg80211: Using old static regulatory domain:\n");
-	print_regdomain_info(cfg80211_regdomain);
-	/* The old code still requests for a new regdomain and if
-	 * you have CRDA you get it updated, otherwise you get
-	 * stuck with the static values. We ignore "EU" code as
-	 * that is not a valid ISO / IEC 3166 alpha2 */
-	if (ieee80211_regdom[0] != 'E' &&
-			ieee80211_regdom[1] != 'U')
-		err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
-			ieee80211_regdom, NULL);
-#else
-	err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", NULL);
-	if (err)
-		printk(KERN_ERR "cfg80211: calling CRDA failed - "
-			"unable to update world regulatory domain, "
-			"using static definition\n");
-#endif
-
 	return 0;
 
 out_fail_reg:
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 592b2e391d4..5fbeab50996 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -42,6 +42,18 @@
 #include "core.h"
 #include "reg.h"
 
+/* wiphy is set if this request's initiator is REGDOM_SET_BY_DRIVER */
+struct regulatory_request {
+	struct list_head list;
+	struct wiphy *wiphy;
+	int granted;
+	enum reg_set_by initiator;
+	char alpha2[2];
+};
+
+static LIST_HEAD(regulatory_requests);
+DEFINE_MUTEX(cfg80211_reg_mutex);
+
 /* To trigger userspace events */
 static struct platform_device *reg_pdev;
 
@@ -51,6 +63,161 @@ static u32 supported_bandwidths[] = {
 	MHZ_TO_KHZ(20),
 };
 
+static struct list_head regulatory_requests;
+
+/* Central wireless core regulatory domains, we only need two,
+ * the current one and a world regulatory domain in case we have no
+ * information to give us an alpha2 */
+static struct ieee80211_regdomain *cfg80211_regdomain;
+
+/* We keep a static world regulatory domain in case of the absence of CRDA */
+static const struct ieee80211_regdomain world_regdom = {
+	.n_reg_rules = 1,
+	.alpha2 =  "00",
+	.reg_rules = {
+		REG_RULE(2412-10, 2462+10, 40, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN |
+			NL80211_RRF_NO_IBSS),
+	}
+};
+
+static struct ieee80211_regdomain *cfg80211_world_regdom =
+	(struct ieee80211_regdomain *) &world_regdom;
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+static char *ieee80211_regdom = "US";
+module_param(ieee80211_regdom, charp, 0444);
+MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
+
+/* We assume 40 MHz bandwidth for the old regulatory work.
+ * We make emphasis we are using the exact same frequencies
+ * as before */
+
+static const struct ieee80211_regdomain us_regdom = {
+	.n_reg_rules = 6,
+	.alpha2 =  "US",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..11 */
+		REG_RULE(2412-10, 2462+10, 40, 6, 27, 0),
+		/* IEEE 802.11a, channel 36 */
+		REG_RULE(5180-10, 5180+10, 40, 6, 23, 0),
+		/* IEEE 802.11a, channel 40 */
+		REG_RULE(5200-10, 5200+10, 40, 6, 23, 0),
+		/* IEEE 802.11a, channel 44 */
+		REG_RULE(5220-10, 5220+10, 40, 6, 23, 0),
+		/* IEEE 802.11a, channels 48..64 */
+		REG_RULE(5240-10, 5320+10, 40, 6, 23, 0),
+		/* IEEE 802.11a, channels 149..165, outdoor */
+		REG_RULE(5745-10, 5825+10, 40, 6, 30, 0),
+	}
+};
+
+static const struct ieee80211_regdomain jp_regdom = {
+	.n_reg_rules = 3,
+	.alpha2 =  "JP",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..14 */
+		REG_RULE(2412-10, 2484+10, 40, 6, 20, 0),
+		/* IEEE 802.11a, channels 34..48 */
+		REG_RULE(5170-10, 5240+10, 40, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channels 52..64 */
+		REG_RULE(5260-10, 5320+10, 40, 6, 20,
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_DFS),
+	}
+};
+
+static const struct ieee80211_regdomain eu_regdom = {
+	.n_reg_rules = 6,
+	/* This alpha2 is bogus, we leave it here just for stupid
+	 * backward compatibility */
+	.alpha2 =  "EU",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..13 */
+		REG_RULE(2412-10, 2472+10, 40, 6, 20, 0),
+		/* IEEE 802.11a, channel 36 */
+		REG_RULE(5180-10, 5180+10, 40, 6, 23,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channel 40 */
+		REG_RULE(5200-10, 5200+10, 40, 6, 23,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channel 44 */
+		REG_RULE(5220-10, 5220+10, 40, 6, 23,
+			NL80211_RRF_PASSIVE_SCAN),
+		/* IEEE 802.11a, channels 48..64 */
+		REG_RULE(5240-10, 5320+10, 40, 6, 20,
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_DFS),
+		/* IEEE 802.11a, channels 100..140 */
+		REG_RULE(5500-10, 5700+10, 40, 6, 30,
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_DFS),
+	}
+};
+
+static const struct ieee80211_regdomain *static_regdom(char *alpha2)
+{
+	if (alpha2[0] == 'U' && alpha2[1] == 'S')
+		return &us_regdom;
+	if (alpha2[0] == 'J' && alpha2[1] == 'P')
+		return &jp_regdom;
+	if (alpha2[0] == 'E' && alpha2[1] == 'U')
+		return &eu_regdom;
+	/* Default, as per the old rules */
+	return &us_regdom;
+}
+
+static bool is_old_static_regdom(struct ieee80211_regdomain *rd)
+{
+	if (rd == &us_regdom || rd == &jp_regdom || rd == &eu_regdom)
+		return true;
+	return false;
+}
+
+/* The old crap never deals with a world regulatory domain, it only
+ * deals with the static regulatory domain passed and if possible
+ * an updated "US" or "JP" regulatory domain. We do however store the
+ * old static regulatory domain in cfg80211_world_regdom for convenience
+ * of use here */
+static void reset_regdomains_static(void)
+{
+	if (!is_old_static_regdom(cfg80211_regdomain))
+		kfree(cfg80211_regdomain);
+	/* This is setting the regdom to the old static regdom */
+	cfg80211_regdomain =
+		(struct ieee80211_regdomain *) cfg80211_world_regdom;
+}
+#else
+static void reset_regdomains(void)
+{
+	if (cfg80211_world_regdom && cfg80211_world_regdom != &world_regdom) {
+		if (cfg80211_world_regdom == cfg80211_regdomain) {
+			kfree(cfg80211_regdomain);
+		} else {
+			kfree(cfg80211_world_regdom);
+			kfree(cfg80211_regdomain);
+		}
+	} else if (cfg80211_regdomain && cfg80211_regdomain != &world_regdom)
+		kfree(cfg80211_regdomain);
+
+	cfg80211_world_regdom = (struct ieee80211_regdomain *) &world_regdom;
+	cfg80211_regdomain = NULL;
+}
+
+/* Dynamic world regulatory domain requested by the wireless
+ * core upon initialization */
+static void update_world_regdomain(struct ieee80211_regdomain *rd)
+{
+	BUG_ON(list_empty(&regulatory_requests));
+
+	reset_regdomains();
+
+	cfg80211_world_regdom = rd;
+	cfg80211_regdomain = rd;
+}
+#endif
+
 bool is_world_regdom(char *alpha2)
 {
 	if (!alpha2)
@@ -555,58 +722,6 @@ void print_regdomain_info(struct ieee80211_regdomain *rd)
 	print_rd_rules(rd);
 }
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-
-static bool is_old_static_regdom(struct ieee80211_regdomain *rd)
-{
-	if (rd == &us_regdom || rd == &jp_regdom || rd == &eu_regdom)
-		return true;
-	return false;
-}
-
-/* The old crap never deals with a world regulatory domain, it only
- * deals with the static regulatory domain passed and if possible
- * an updated "US" or "JP" regulatory domain. We do however store the
- * old static regulatory domain in cfg80211_world_regdom for convenience
- * of use here */
-static void reset_regdomains_static(void)
-{
-	if (!is_old_static_regdom(cfg80211_regdomain))
-		kfree(cfg80211_regdomain);
-	/* This is setting the regdom to the old static regdom */
-	cfg80211_regdomain =
-		(struct ieee80211_regdomain *) cfg80211_world_regdom;
-}
-#else
-static void reset_regdomains(void)
-{
-	if (cfg80211_world_regdom && cfg80211_world_regdom != &world_regdom) {
-		if (cfg80211_world_regdom == cfg80211_regdomain) {
-			kfree(cfg80211_regdomain);
-		} else {
-			kfree(cfg80211_world_regdom);
-			kfree(cfg80211_regdomain);
-		}
-	} else if (cfg80211_regdomain && cfg80211_regdomain != &world_regdom)
-		kfree(cfg80211_regdomain);
-
-	cfg80211_world_regdom = (struct ieee80211_regdomain *) &world_regdom;
-	cfg80211_regdomain = NULL;
-}
-
-/* Dynamic world regulatory domain requested by the wireless
- * core upon initialization */
-static void update_world_regdomain(struct ieee80211_regdomain *rd)
-{
-	BUG_ON(list_empty(&regulatory_requests));
-
-	reset_regdomains();
-
-	cfg80211_world_regdom = rd;
-	cfg80211_regdomain = rd;
-}
-#endif
-
 static int __set_regdom(struct ieee80211_regdomain *rd)
 {
 	struct regulatory_request *request = NULL;
@@ -615,7 +730,7 @@ static int __set_regdom(struct ieee80211_regdomain *rd)
 
 #ifdef CONFIG_WIRELESS_OLD_REGULATORY
 	/* We ignore the world regdom with the old static regdomains setup
-	 * as there is no point to it with satic regulatory definitions :(
+	 * as there is no point to it with static regulatory definitions :(
 	 * Don't worry this shit will be removed soon... */
 	if (is_world_regdom(rd->alpha2))
 		return -EINVAL;
@@ -735,25 +850,58 @@ int set_regdom(struct ieee80211_regdomain *rd)
 
 int regulatory_init(void)
 {
+	int err;
+
 	reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
 	if (IS_ERR(reg_pdev))
 		return PTR_ERR(reg_pdev);
+
+#ifdef CONFIG_WIRELESS_OLD_REGULATORY
+	cfg80211_regdomain =
+		(struct ieee80211_regdomain *) static_regdom(ieee80211_regdom);
+	/* Used during reset_regdomains_static() */
+	cfg80211_world_regdom = cfg80211_regdomain;
+
+	printk(KERN_INFO "cfg80211: Using old static regulatory domain:\n");
+	print_regdomain_info(cfg80211_regdomain);
+	/* The old code still requests for a new regdomain and if
+	 * you have CRDA you get it updated, otherwise you get
+	 * stuck with the static values. We ignore "EU" code as
+	 * that is not a valid ISO / IEC 3166 alpha2 */
+	if (ieee80211_regdom[0] != 'E' && ieee80211_regdom[1] != 'U')
+		err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
+					ieee80211_regdom, NULL);
+#else
+	cfg80211_regdomain =
+		(struct ieee80211_regdomain *) cfg80211_world_regdom;
+
+	err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", NULL);
+	if (err)
+		printk(KERN_ERR "cfg80211: calling CRDA failed - "
+		       "unable to update world regulatory domain, "
+		       "using static definition\n");
+#endif
+
 	return 0;
 }
 
 void regulatory_exit(void)
 {
 	struct regulatory_request *req, *req_tmp;
+
 	mutex_lock(&cfg80211_drv_mutex);
+
 #ifdef CONFIG_WIRELESS_OLD_REGULATORY
 	reset_regdomains_static();
 #else
 	reset_regdomains();
 #endif
+
 	list_for_each_entry_safe(req, req_tmp, &regulatory_requests, list) {
 		list_del(&req->list);
 		kfree(req);
 	}
 	platform_device_unregister(reg_pdev);
+
 	mutex_unlock(&cfg80211_drv_mutex);
 }
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index d75fd023297..b169815987f 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -1,44 +1,13 @@
 #ifndef __NET_WIRELESS_REG_H
 #define __NET_WIRELESS_REG_H
 
-extern const struct ieee80211_regdomain world_regdom;
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-extern const struct ieee80211_regdomain us_regdom;
-extern const struct ieee80211_regdomain jp_regdom;
-extern const struct ieee80211_regdomain eu_regdom;
-#endif
-
-extern struct ieee80211_regdomain *cfg80211_regdomain;
-extern struct ieee80211_regdomain *cfg80211_world_regdom;
-extern struct list_head regulatory_requests;
-
-struct regdom_last_setby {
-	struct wiphy *wiphy;
-	u8 initiator;
-};
-
-/* wiphy is set if this request's initiator is REGDOM_SET_BY_DRIVER */
-struct regulatory_request {
-	struct list_head list;
-	struct wiphy *wiphy;
-	int granted;
-	enum reg_set_by initiator;
-	char alpha2[2];
-};
-
+extern struct mutex cfg80211_reg_mutex;
 bool is_world_regdom(char *alpha2);
 bool reg_is_valid_request(char *alpha2);
 
-int set_regdom(struct ieee80211_regdomain *rd);
-int __regulatory_hint_alpha2(struct wiphy *wiphy, enum reg_set_by set_by,
-		      const char *alpha2);
-
 int regulatory_init(void);
 void regulatory_exit(void);
 
-void print_regdomain_info(struct ieee80211_regdomain *);
-
-/* If a char is A-Z */
-#define IS_ALPHA(letter) (letter >= 65 && letter <= 90)
+int set_regdom(struct ieee80211_regdomain *rd);
 
 #endif  /* __NET_WIRELESS_REG_H */
-- 
cgit v1.2.3


From a3d2eaf0dcad6dfdf44f3093aef688dfca714b6c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 15 Sep 2008 11:10:52 +0200
Subject: cfg80211: fix regulatory code const

A few pointers and structures in the regulatory code are const,
but because it wasn't done properly a whole bunch of bogus
casts were needed to compile without warning. Mark everything
const properly to avoid that kind of junk code.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/reg.c | 58 ++++++++++++++++++++++++++----------------------------
 net/wireless/reg.h |  6 +++---
 2 files changed, 31 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5fbeab50996..7aba46efc7d 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -68,7 +68,7 @@ static struct list_head regulatory_requests;
 /* Central wireless core regulatory domains, we only need two,
  * the current one and a world regulatory domain in case we have no
  * information to give us an alpha2 */
-static struct ieee80211_regdomain *cfg80211_regdomain;
+static const struct ieee80211_regdomain *cfg80211_regdomain;
 
 /* We keep a static world regulatory domain in case of the absence of CRDA */
 static const struct ieee80211_regdomain world_regdom = {
@@ -81,8 +81,8 @@ static const struct ieee80211_regdomain world_regdom = {
 	}
 };
 
-static struct ieee80211_regdomain *cfg80211_world_regdom =
-	(struct ieee80211_regdomain *) &world_regdom;
+static const struct ieee80211_regdomain *cfg80211_world_regdom =
+	&world_regdom;
 
 #ifdef CONFIG_WIRELESS_OLD_REGULATORY
 static char *ieee80211_regdom = "US";
@@ -168,7 +168,7 @@ static const struct ieee80211_regdomain *static_regdom(char *alpha2)
 	return &us_regdom;
 }
 
-static bool is_old_static_regdom(struct ieee80211_regdomain *rd)
+static bool is_old_static_regdom(const struct ieee80211_regdomain *rd)
 {
 	if (rd == &us_regdom || rd == &jp_regdom || rd == &eu_regdom)
 		return true;
@@ -201,13 +201,13 @@ static void reset_regdomains(void)
 	} else if (cfg80211_regdomain && cfg80211_regdomain != &world_regdom)
 		kfree(cfg80211_regdomain);
 
-	cfg80211_world_regdom = (struct ieee80211_regdomain *) &world_regdom;
+	cfg80211_world_regdom = &world_regdom;
 	cfg80211_regdomain = NULL;
 }
 
 /* Dynamic world regulatory domain requested by the wireless
  * core upon initialization */
-static void update_world_regdomain(struct ieee80211_regdomain *rd)
+static void update_world_regdomain(const struct ieee80211_regdomain *rd)
 {
 	BUG_ON(list_empty(&regulatory_requests));
 
@@ -218,7 +218,7 @@ static void update_world_regdomain(struct ieee80211_regdomain *rd)
 }
 #endif
 
-bool is_world_regdom(char *alpha2)
+bool is_world_regdom(const char *alpha2)
 {
 	if (!alpha2)
 		return false;
@@ -227,7 +227,7 @@ bool is_world_regdom(char *alpha2)
 	return false;
 }
 
-static bool is_alpha2_set(char *alpha2)
+static bool is_alpha2_set(const char *alpha2)
 {
 	if (!alpha2)
 		return false;
@@ -244,7 +244,7 @@ static bool is_alpha_upper(char letter)
 	return false;
 }
 
-static bool is_unknown_alpha2(char *alpha2)
+static bool is_unknown_alpha2(const char *alpha2)
 {
 	if (!alpha2)
 		return false;
@@ -255,7 +255,7 @@ static bool is_unknown_alpha2(char *alpha2)
 	return false;
 }
 
-static bool is_an_alpha2(char *alpha2)
+static bool is_an_alpha2(const char *alpha2)
 {
 	if (!alpha2)
 		return false;
@@ -264,7 +264,7 @@ static bool is_an_alpha2(char *alpha2)
 	return false;
 }
 
-static bool alpha2_equal(char *alpha2_x, char *alpha2_y)
+static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
 {
 	if (!alpha2_x || !alpha2_y)
 		return false;
@@ -274,7 +274,7 @@ static bool alpha2_equal(char *alpha2_x, char *alpha2_y)
 	return false;
 }
 
-static bool regdom_changed(char *alpha2)
+static bool regdom_changed(const char *alpha2)
 {
 	if (!cfg80211_regdomain)
 		return true;
@@ -405,7 +405,7 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
 	}
 }
 
-static bool __reg_is_valid_request(char *alpha2,
+static bool __reg_is_valid_request(const char *alpha2,
 	struct regulatory_request **request)
 {
 	struct regulatory_request *req;
@@ -421,16 +421,16 @@ static bool __reg_is_valid_request(char *alpha2,
 }
 
 /* Used by nl80211 before kmalloc'ing our regulatory domain */
-bool reg_is_valid_request(char *alpha2)
+bool reg_is_valid_request(const char *alpha2)
 {
 	struct regulatory_request *request = NULL;
 	return  __reg_is_valid_request(alpha2, &request);
 }
 
 /* Sanity check on a regulatory rule */
-static bool is_valid_reg_rule(struct ieee80211_reg_rule *rule)
+static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
 {
-	struct ieee80211_freq_range *freq_range = &rule->freq_range;
+	const struct ieee80211_freq_range *freq_range = &rule->freq_range;
 	u32 freq_diff;
 
 	if (freq_range->start_freq_khz == 0 || freq_range->end_freq_khz == 0)
@@ -447,9 +447,9 @@ static bool is_valid_reg_rule(struct ieee80211_reg_rule *rule)
 	return true;
 }
 
-static bool is_valid_rd(struct ieee80211_regdomain *rd)
+static bool is_valid_rd(const struct ieee80211_regdomain *rd)
 {
-	struct ieee80211_reg_rule *reg_rule = NULL;
+	const struct ieee80211_reg_rule *reg_rule = NULL;
 	unsigned int i;
 
 	if (!rd->n_reg_rules)
@@ -661,12 +661,12 @@ unlock_and_exit:
 EXPORT_SYMBOL(regulatory_hint);
 
 
-static void print_rd_rules(struct ieee80211_regdomain *rd)
+static void print_rd_rules(const struct ieee80211_regdomain *rd)
 {
 	unsigned int i;
-	struct ieee80211_reg_rule *reg_rule = NULL;
-	struct ieee80211_freq_range *freq_range = NULL;
-	struct ieee80211_power_rule *power_rule = NULL;
+	const struct ieee80211_reg_rule *reg_rule = NULL;
+	const struct ieee80211_freq_range *freq_range = NULL;
+	const struct ieee80211_power_rule *power_rule = NULL;
 
 	printk(KERN_INFO "\t(start_freq - end_freq @ bandwidth), "
 		"(max_antenna_gain, max_eirp)\n");
@@ -696,7 +696,7 @@ static void print_rd_rules(struct ieee80211_regdomain *rd)
 	}
 }
 
-static void print_regdomain(struct ieee80211_regdomain *rd)
+static void print_regdomain(const struct ieee80211_regdomain *rd)
 {
 
 	if (is_world_regdom(rd->alpha2))
@@ -715,14 +715,14 @@ static void print_regdomain(struct ieee80211_regdomain *rd)
 	print_rd_rules(rd);
 }
 
-void print_regdomain_info(struct ieee80211_regdomain *rd)
+void print_regdomain_info(const struct ieee80211_regdomain *rd)
 {
 	printk(KERN_INFO "cfg80211: Regulatory domain: %c%c\n",
 		rd->alpha2[0], rd->alpha2[1]);
 	print_rd_rules(rd);
 }
 
-static int __set_regdom(struct ieee80211_regdomain *rd)
+static int __set_regdom(const struct ieee80211_regdomain *rd)
 {
 	struct regulatory_request *request = NULL;
 
@@ -804,7 +804,7 @@ static int __set_regdom(struct ieee80211_regdomain *rd)
  * multiple drivers can be ironed out later. Caller must've already
  * kmalloc'd the rd structure. If this calls fails you should kfree()
  * the passed rd. Caller must hold cfg80211_drv_mutex */
-int set_regdom(struct ieee80211_regdomain *rd)
+int set_regdom(const struct ieee80211_regdomain *rd)
 {
 	struct regulatory_request *this_request = NULL, *prev_request = NULL;
 	int r;
@@ -857,8 +857,7 @@ int regulatory_init(void)
 		return PTR_ERR(reg_pdev);
 
 #ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	cfg80211_regdomain =
-		(struct ieee80211_regdomain *) static_regdom(ieee80211_regdom);
+	cfg80211_regdomain = static_regdom(ieee80211_regdom);
 	/* Used during reset_regdomains_static() */
 	cfg80211_world_regdom = cfg80211_regdomain;
 
@@ -872,8 +871,7 @@ int regulatory_init(void)
 		err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
 					ieee80211_regdom, NULL);
 #else
-	cfg80211_regdomain =
-		(struct ieee80211_regdomain *) cfg80211_world_regdom;
+	cfg80211_regdomain = cfg80211_world_regdom;
 
 	err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", NULL);
 	if (err)
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index b169815987f..a33362872f3 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -2,12 +2,12 @@
 #define __NET_WIRELESS_REG_H
 
 extern struct mutex cfg80211_reg_mutex;
-bool is_world_regdom(char *alpha2);
-bool reg_is_valid_request(char *alpha2);
+bool is_world_regdom(const char *alpha2);
+bool reg_is_valid_request(const char *alpha2);
 
 int regulatory_init(void);
 void regulatory_exit(void);
 
-int set_regdom(struct ieee80211_regdomain *rd);
+int set_regdom(const struct ieee80211_regdomain *rd);
 
 #endif  /* __NET_WIRELESS_REG_H */
-- 
cgit v1.2.3


From 942b25cf9028e7c2f6446ee7c6618bd70dafec5f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 15 Sep 2008 11:26:47 +0200
Subject: cfg80211: clean up static regdomain mess

The statically defined regdomains are used in a very convoluted
way, use them instead to prime the information we have and then
continue operating normally.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/reg.c | 73 +++++++++++++++---------------------------------------
 1 file changed, 20 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 7aba46efc7d..626dbb68849 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -174,32 +174,27 @@ static bool is_old_static_regdom(const struct ieee80211_regdomain *rd)
 		return true;
 	return false;
 }
-
-/* The old crap never deals with a world regulatory domain, it only
- * deals with the static regulatory domain passed and if possible
- * an updated "US" or "JP" regulatory domain. We do however store the
- * old static regulatory domain in cfg80211_world_regdom for convenience
- * of use here */
-static void reset_regdomains_static(void)
+#else
+static inline bool is_old_static_regdom(const struct ieee80211_regdomain *rd)
 {
-	if (!is_old_static_regdom(cfg80211_regdomain))
-		kfree(cfg80211_regdomain);
-	/* This is setting the regdom to the old static regdom */
-	cfg80211_regdomain =
-		(struct ieee80211_regdomain *) cfg80211_world_regdom;
+	return false;
 }
-#else
+#endif
+
 static void reset_regdomains(void)
 {
-	if (cfg80211_world_regdom && cfg80211_world_regdom != &world_regdom) {
-		if (cfg80211_world_regdom == cfg80211_regdomain) {
-			kfree(cfg80211_regdomain);
-		} else {
-			kfree(cfg80211_world_regdom);
-			kfree(cfg80211_regdomain);
-		}
-	} else if (cfg80211_regdomain && cfg80211_regdomain != &world_regdom)
-		kfree(cfg80211_regdomain);
+	/* avoid freeing static information or freeing something twice */
+	if (cfg80211_regdomain == cfg80211_world_regdom)
+		cfg80211_regdomain = NULL;
+	if (cfg80211_world_regdom == &world_regdom)
+		cfg80211_world_regdom = NULL;
+	if (cfg80211_regdomain == &world_regdom)
+		cfg80211_regdomain = NULL;
+	if (is_old_static_regdom(cfg80211_regdomain))
+		cfg80211_regdomain = NULL;
+
+	kfree(cfg80211_regdomain);
+	kfree(cfg80211_world_regdom);
 
 	cfg80211_world_regdom = &world_regdom;
 	cfg80211_regdomain = NULL;
@@ -216,7 +211,6 @@ static void update_world_regdomain(const struct ieee80211_regdomain *rd)
 	cfg80211_world_regdom = rd;
 	cfg80211_regdomain = rd;
 }
-#endif
 
 bool is_world_regdom(const char *alpha2)
 {
@@ -297,12 +291,8 @@ static int call_crda(const char *alpha2)
 		printk(KERN_INFO "cfg80211: Calling CRDA for country: %c%c\n",
 			alpha2[0], alpha2[1]);
 	else
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-		return -EINVAL;
-#else
 		printk(KERN_INFO "cfg80211: Calling CRDA to update world "
 			"regulatory domain\n");
-#endif
 
 	country_env[8] = alpha2[0];
 	country_env[9] = alpha2[1];
@@ -728,20 +718,12 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 
 	/* Some basic sanity checks first */
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	/* We ignore the world regdom with the old static regdomains setup
-	 * as there is no point to it with static regulatory definitions :(
-	 * Don't worry this shit will be removed soon... */
-	if (is_world_regdom(rd->alpha2))
-		return -EINVAL;
-#else
 	if (is_world_regdom(rd->alpha2)) {
 		if (WARN_ON(!__reg_is_valid_request(rd->alpha2, &request)))
 			return -EINVAL;
 		update_world_regdomain(rd);
 		return 0;
 	}
-#endif
 
 	if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) &&
 			!is_unknown_alpha2(rd->alpha2))
@@ -750,15 +732,10 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 	if (list_empty(&regulatory_requests))
 		return -EINVAL;
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	/* Static "US" and "JP" will be overridden, but just once */
+	/* allow overriding the static definitions if CRDA is present */
 	if (!is_old_static_regdom(cfg80211_regdomain) &&
-			!regdom_changed(rd->alpha2))
-		return -EINVAL;
-#else
-	if (!regdom_changed(rd->alpha2))
+	    !regdom_changed(rd->alpha2))
 		return -EINVAL;
-#endif
 
 	/* Now lets set the regulatory domain, update all driver channels
 	 * and finally inform them of what we have done, in case they want
@@ -768,11 +745,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 	if (WARN_ON(!__reg_is_valid_request(rd->alpha2, &request)))
 		return -EINVAL;
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	reset_regdomains_static();
-#else
 	reset_regdomains();
-#endif
 
 	/* Country IE parsing coming soon */
 	switch (request->initiator) {
@@ -858,10 +831,8 @@ int regulatory_init(void)
 
 #ifdef CONFIG_WIRELESS_OLD_REGULATORY
 	cfg80211_regdomain = static_regdom(ieee80211_regdom);
-	/* Used during reset_regdomains_static() */
-	cfg80211_world_regdom = cfg80211_regdomain;
 
-	printk(KERN_INFO "cfg80211: Using old static regulatory domain:\n");
+	printk(KERN_INFO "cfg80211: Using static regulatory domain info\n");
 	print_regdomain_info(cfg80211_regdomain);
 	/* The old code still requests for a new regdomain and if
 	 * you have CRDA you get it updated, otherwise you get
@@ -889,11 +860,7 @@ void regulatory_exit(void)
 
 	mutex_lock(&cfg80211_drv_mutex);
 
-#ifdef CONFIG_WIRELESS_OLD_REGULATORY
-	reset_regdomains_static();
-#else
 	reset_regdomains();
-#endif
 
 	list_for_each_entry_safe(req, req_tmp, &regulatory_requests, list) {
 		list_del(&req->list);
-- 
cgit v1.2.3


From 133b822638ff01eb1e32e1917b197c40ed095ddd Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 16 Sep 2008 14:18:59 +0200
Subject: mac80211: make master iface not wireless

There's no need to register the master netdev with cfg80211,
in fact, this is quite dangerous and lead to having to add
checks for the master interface all over the config handlers.
This patch removes the "ieee80211_ptr" from the master iface
in favour of having a small netdev_priv() associated with
the master interface that stores the ieee80211_local pointer.
Because of this, a lot of code in the configuration handlers
can go away. To make this patch easier to verify I have also
removed a number of wiphy_priv() calls in favour of getting
the sdata first and then the local pointer from that.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c         | 59 ----------------------------------------------
 net/mac80211/debugfs_sta.c |  3 +--
 net/mac80211/ieee80211_i.h |  4 ++++
 net/mac80211/main.c        | 19 ++++++++-------
 net/mac80211/mesh.c        |  2 +-
 net/mac80211/rx.c          | 25 ++++++++------------
 net/mac80211/tx.c          | 26 ++++++++++----------
 7 files changed, 39 insertions(+), 99 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e2574885db4..a8501f14b16 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -82,7 +82,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 				  enum nl80211_iftype type, u32 *flags,
 				  struct vif_params *params)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct net_device *dev;
 	struct ieee80211_sub_if_data *sdata;
 	int ret;
@@ -95,9 +94,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 	if (!nl80211_type_check(type))
 		return -EINVAL;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	ret = ieee80211_if_change_type(sdata, type);
@@ -120,16 +116,12 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
 			     u8 key_idx, u8 *mac_addr,
 			     struct key_params *params)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *sta = NULL;
 	enum ieee80211_key_alg alg;
 	struct ieee80211_key *key;
 	int err;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	switch (params->cipher) {
@@ -174,14 +166,10 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
 static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
 			     u8 key_idx, u8 *mac_addr)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *sta;
 	int ret;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	rcu_read_lock();
@@ -222,7 +210,6 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 			     void (*callback)(void *cookie,
 					      struct key_params *params))
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *sta = NULL;
 	u8 seq[6] = {0};
@@ -232,9 +219,6 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 	u16 iv16;
 	int err = -ENOENT;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	rcu_read_lock();
@@ -310,12 +294,8 @@ static int ieee80211_config_default_key(struct wiphy *wiphy,
 					struct net_device *dev,
 					u8 key_idx)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	rcu_read_lock();
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -496,13 +476,9 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata,
 static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev,
 				struct beacon_parameters *params)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct beacon_data *old;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
@@ -519,13 +495,9 @@ static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev,
 static int ieee80211_set_beacon(struct wiphy *wiphy, struct net_device *dev,
 				struct beacon_parameters *params)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct beacon_data *old;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
@@ -541,13 +513,9 @@ static int ieee80211_set_beacon(struct wiphy *wiphy, struct net_device *dev,
 
 static int ieee80211_del_beacon(struct wiphy *wiphy, struct net_device *dev)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct beacon_data *old;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
@@ -695,9 +663,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata;
 	int err;
 
-	if (dev == local->mdev || params->vlan == local->mdev)
-		return -EOPNOTSUPP;
-
 	/* Prevent a race with changing the rate control algorithm */
 	if (!netif_running(dev))
 		return -ENETDOWN;
@@ -752,9 +717,6 @@ static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *sta;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (mac) {
@@ -786,9 +748,6 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 	struct sta_info *sta;
 	struct ieee80211_sub_if_data *vlansdata;
 
-	if (dev == local->mdev || params->vlan == local->mdev)
-		return -EOPNOTSUPP;
-
 	rcu_read_lock();
 
 	/* XXX: get sta belonging to dev */
@@ -828,9 +787,6 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
 	struct sta_info *sta;
 	int err;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	if (!netif_running(dev))
 		return -ENETDOWN;
 
@@ -884,9 +840,6 @@ static int ieee80211_change_mpath(struct wiphy *wiphy,
 	struct mesh_path *mpath;
 	struct sta_info *sta;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	if (!netif_running(dev))
 		return -ENETDOWN;
 
@@ -958,13 +911,9 @@ static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
 			       u8 *dst, u8 *next_hop, struct mpath_info *pinfo)
 
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct mesh_path *mpath;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
@@ -986,13 +935,9 @@ static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
 				 int idx, u8 *dst, u8 *next_hop,
 				 struct mpath_info *pinfo)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	struct mesh_path *mpath;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
@@ -1015,13 +960,9 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 				struct net_device *dev,
 				struct bss_parameters *params)
 {
-	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
 	u32 changed = 0;
 
-	if (dev == local->mdev)
-		return -EOPNOTSUPP;
-
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 81f350eaf8a..b9902e425f0 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -173,8 +173,7 @@ static ssize_t sta_agg_status_write(struct file *file,
 		const char __user *user_buf, size_t count, loff_t *ppos)
 {
 	struct sta_info *sta = file->private_data;
-	struct net_device *dev = sta->sdata->dev;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_local *local = sta->sdata->local;
 	struct ieee80211_hw *hw = &local->hw;
 	u8 *da = sta->sta.addr;
 	static int tid_static_tx[16] = {0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 3912fba6d3d..0b25b0f46b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -573,6 +573,10 @@ enum {
 /* maximum number of hardware queues we support. */
 #define QD_MAX_QUEUES (IEEE80211_MAX_AMPDU_QUEUES + IEEE80211_MAX_QUEUES)
 
+struct ieee80211_master_priv {
+	struct ieee80211_local *local;
+};
+
 struct ieee80211_local {
 	/* embed the driver visible part.
 	 * don't cast (use the static inlines below), but we keep
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index c307dba7ec0..7d2d5a041e2 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -106,7 +106,8 @@ static const struct header_ops ieee80211_header_ops = {
 
 static int ieee80211_master_open(struct net_device *dev)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
+	struct ieee80211_local *local = mpriv->local;
 	struct ieee80211_sub_if_data *sdata;
 	int res = -EOPNOTSUPP;
 
@@ -128,7 +129,8 @@ static int ieee80211_master_open(struct net_device *dev)
 
 static int ieee80211_master_stop(struct net_device *dev)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
+	struct ieee80211_local *local = mpriv->local;
 	struct ieee80211_sub_if_data *sdata;
 
 	/* we hold the RTNL here so can safely walk the list */
@@ -141,7 +143,8 @@ static int ieee80211_master_stop(struct net_device *dev)
 
 static void ieee80211_master_set_multicast_list(struct net_device *dev)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
+	struct ieee80211_local *local = mpriv->local;
 
 	ieee80211_configure_filter(local);
 }
@@ -787,7 +790,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	int result;
 	enum ieee80211_band band;
 	struct net_device *mdev;
-	struct wireless_dev *mwdev;
+	struct ieee80211_master_priv *mpriv;
 
 	/*
 	 * generic code guarantees at least one band,
@@ -829,16 +832,14 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	if (hw->queues < 4)
 		hw->ampdu_queues = 0;
 
-	mdev = alloc_netdev_mq(sizeof(struct wireless_dev),
+	mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv),
 			       "wmaster%d", ether_setup,
 			       ieee80211_num_queues(hw));
 	if (!mdev)
 		goto fail_mdev_alloc;
 
-	mwdev = netdev_priv(mdev);
-	mdev->ieee80211_ptr = mwdev;
-	mwdev->wiphy = local->hw.wiphy;
-
+	mpriv = netdev_priv(mdev);
+	mpriv->local = local;
 	local->mdev = mdev;
 
 	ieee80211_rx_bss_list_init(local);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 30cf891fd3a..8013277924f 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -351,7 +351,7 @@ static void ieee80211_mesh_path_timer(unsigned long data)
 	struct ieee80211_sub_if_data *sdata =
 		(struct ieee80211_sub_if_data *) data;
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	struct ieee80211_local *local = wdev_priv(&sdata->wdev);
+	struct ieee80211_local *local = sdata->local;
 
 	queue_work(local->hw.workqueue, &ifmsh->work);
 }
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 92d898b901e..3ab9670f180 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -650,32 +650,28 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 	return result;
 }
 
-static void ap_sta_ps_start(struct net_device *dev, struct sta_info *sta)
+static void ap_sta_ps_start(struct sta_info *sta)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	DECLARE_MAC_BUF(mac);
 
-	sdata = sta->sdata;
-
 	atomic_inc(&sdata->bss->num_sta_ps);
 	set_and_clear_sta_flags(sta, WLAN_STA_PS, WLAN_STA_PSPOLL);
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 	printk(KERN_DEBUG "%s: STA %s aid %d enters power save mode\n",
-	       dev->name, print_mac(mac, sta->sta.addr), sta->sta.aid);
+	       sdata->dev->name, print_mac(mac, sta->sta.addr), sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 }
 
-static int ap_sta_ps_end(struct net_device *dev, struct sta_info *sta)
+static int ap_sta_ps_end(struct sta_info *sta)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	int sent = 0;
-	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_tx_info *info;
 	DECLARE_MAC_BUF(mac);
 
-	sdata = sta->sdata;
-
 	atomic_dec(&sdata->bss->num_sta_ps);
 
 	clear_sta_flags(sta, WLAN_STA_PS | WLAN_STA_PSPOLL);
@@ -685,7 +681,7 @@ static int ap_sta_ps_end(struct net_device *dev, struct sta_info *sta)
 
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 	printk(KERN_DEBUG "%s: STA %s aid %d exits power save mode\n",
-	       dev->name, print_mac(mac, sta->sta.addr), sta->sta.aid);
+	       sdata->dev->name, print_mac(mac, sta->sta.addr), sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 
 	/* Send all buffered frames to the station */
@@ -701,7 +697,7 @@ static int ap_sta_ps_end(struct net_device *dev, struct sta_info *sta)
 		sent++;
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
 		printk(KERN_DEBUG "%s: STA %s aid %d send PS frame "
-		       "since STA not sleeping anymore\n", dev->name,
+		       "since STA not sleeping anymore\n", sdata->dev->name,
 		       print_mac(mac, sta->sta.addr), sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
 		info->flags |= IEEE80211_TX_CTL_REQUEUE;
@@ -715,7 +711,6 @@ static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 {
 	struct sta_info *sta = rx->sta;
-	struct net_device *dev = rx->dev;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
 
 	if (!sta)
@@ -757,10 +752,10 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 		 * exchange sequence */
 		if (test_sta_flags(sta, WLAN_STA_PS) &&
 		    !ieee80211_has_pm(hdr->frame_control))
-			rx->sent_ps_buffered += ap_sta_ps_end(dev, sta);
+			rx->sent_ps_buffered += ap_sta_ps_end(sta);
 		else if (!test_sta_flags(sta, WLAN_STA_PS) &&
 			 ieee80211_has_pm(hdr->frame_control))
-			ap_sta_ps_start(dev, sta);
+			ap_sta_ps_start(sta);
 	}
 
 	/* Drop data::nullfunc frames silently, since they are used only to
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 20d683641b4..00d798cc9e0 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -165,11 +165,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
 	return cpu_to_le16(dur);
 }
 
-static int inline is_ieee80211_device(struct net_device *dev,
-				      struct net_device *master)
+static int inline is_ieee80211_device(struct ieee80211_local *local,
+				      struct net_device *dev)
 {
-	return (wdev_priv(dev->ieee80211_ptr) ==
-		wdev_priv(master->ieee80211_ptr));
+	return local == wdev_priv(dev->ieee80211_ptr);
 }
 
 /* tx handlers */
@@ -1001,14 +1000,14 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 /*
  * NB: @tx is uninitialised when passed in here
  */
-static int ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
-				struct sk_buff *skb,
-				struct net_device *mdev)
+static int ieee80211_tx_prepare(struct ieee80211_local *local,
+				struct ieee80211_tx_data *tx,
+				struct sk_buff *skb)
 {
 	struct net_device *dev;
 
 	dev = dev_get_by_index(&init_net, skb->iif);
-	if (unlikely(dev && !is_ieee80211_device(dev, mdev))) {
+	if (unlikely(dev && !is_ieee80211_device(local, dev))) {
 		dev_put(dev);
 		dev = NULL;
 	}
@@ -1258,6 +1257,8 @@ static int ieee80211_skb_resize(struct ieee80211_local *local,
 int ieee80211_master_start_xmit(struct sk_buff *skb,
 				struct net_device *dev)
 {
+	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
+	struct ieee80211_local *local = mpriv->local;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct net_device *odev = NULL;
@@ -1273,7 +1274,7 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 
 	if (skb->iif)
 		odev = dev_get_by_index(&init_net, skb->iif);
-	if (unlikely(odev && !is_ieee80211_device(odev, dev))) {
+	if (unlikely(odev && !is_ieee80211_device(local, odev))) {
 		dev_put(odev);
 		odev = NULL;
 	}
@@ -1449,8 +1450,8 @@ fail:
 int ieee80211_subif_start_xmit(struct sk_buff *skb,
 			       struct net_device *dev)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	int ret = 1, head_need;
 	u16 ethertype, hdrlen,  meshhdrlen = 0;
 	__le16 fc;
@@ -1462,7 +1463,6 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 	struct sta_info *sta;
 	u32 sta_flags = 0;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (unlikely(skb->len < ETH_HLEN)) {
 		ret = 0;
 		goto fail;
@@ -2032,7 +2032,7 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw,
 				cpu_to_le16(IEEE80211_FCTL_MOREDATA);
 		}
 
-		if (!ieee80211_tx_prepare(&tx, skb, local->mdev))
+		if (!ieee80211_tx_prepare(local, &tx, skb))
 			break;
 		dev_kfree_skb_any(skb);
 	}
-- 
cgit v1.2.3


From 60719ffd721f6764b7d07ca188c0d944a4330b69 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 16 Sep 2008 14:55:09 +0200
Subject: cfg80211: show interface type

This patch makes cfg80211 show the interface in the nl80211
information about a specific interface. API users are required
to keep the type updated (everything else is fairly complicated)
but you will get a warning if you fail to keep it updated.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/iface.c   | 1 +
 net/wireless/core.c    | 2 ++
 net/wireless/nl80211.c | 6 +++++-
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index a72fbebb8ea..b5cd91e8971 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -625,6 +625,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	/* and set some type-dependent values */
 	sdata->vif.type = type;
 	sdata->dev->hard_start_xmit = ieee80211_subif_start_xmit;
+	sdata->wdev.iftype = type;
 
 	/* only monitor differs */
 	sdata->dev->type = ARPHRD_ETHER;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 88cb7339486..d6940085d59 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -384,6 +384,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
 
 	rdev = wiphy_to_dev(dev->ieee80211_ptr->wiphy);
 
+	WARN_ON(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_UNSPECIFIED);
+
 	switch (state) {
 	case NETDEV_REGISTER:
 		mutex_lock(&rdev->devlist_mtx);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1221d726ed5..44771a690d5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -299,7 +299,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 
 	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
 	NLA_PUT_STRING(msg, NL80211_ATTR_IFNAME, dev->name);
-	/* TODO: interface type */
+	NLA_PUT_U32(msg, NL80211_ATTR_IFTYPE, dev->ieee80211_ptr->iftype);
 	return genlmsg_end(msg, hdr);
 
  nla_put_failure:
@@ -453,6 +453,10 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 				  &flags);
 	err = drv->ops->change_virtual_intf(&drv->wiphy, ifindex,
 					    type, err ? NULL : &flags, &params);
+
+	dev = __dev_get_by_index(&init_net, ifindex);
+	WARN_ON(!dev || (!err && dev->ieee80211_ptr->iftype != type));
+
 	rtnl_unlock();
 
  unlock:
-- 
cgit v1.2.3


From 723b038def23ce0606754c4f598cbb96bae9a102 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 16 Sep 2008 20:22:09 +0200
Subject: cfg80211: allow set_interface without type

Which then causes no type change.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/nl80211.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 44771a690d5..a745932d137 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -422,19 +422,20 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 
 	memset(&params, 0, sizeof(params));
 
-	if (info->attrs[NL80211_ATTR_IFTYPE]) {
-		type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
-		if (type > NL80211_IFTYPE_MAX)
-			return -EINVAL;
-	} else
-		return -EINVAL;
-
 	err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
 	if (err)
 		return err;
 	ifindex = dev->ifindex;
+	type = dev->ieee80211_ptr->iftype;
 	dev_put(dev);
 
+	err = -EINVAL;
+	if (info->attrs[NL80211_ATTR_IFTYPE]) {
+		type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
+		if (type > NL80211_IFTYPE_MAX)
+			goto unlock;
+	}
+
 	if (!drv->ops->change_virtual_intf ||
 	    !(drv->wiphy.interface_modes & (1 << type))) {
 		err = -EOPNOTSUPP;
-- 
cgit v1.2.3


From f8b25cdad719cddceb9cf0d350065b3e59e74219 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 16 Sep 2008 20:22:21 +0200
Subject: mac80211: allow interface settings changes only when down

We currently allow monitor flags changes and mesh ID changes when
the interface is up, which can lead to trouble. Change it to only
allow when down.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index a8501f14b16..89a183c2327 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -100,6 +100,9 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex,
 	if (ret)
 		return ret;
 
+	if (netif_running(sdata->dev))
+		return -EBUSY;
+
 	if (ieee80211_vif_is_mesh(&sdata->vif) && params->mesh_id_len)
 		ieee80211_sdata_set_mesh_id(sdata,
 					    params->mesh_id_len,
-- 
cgit v1.2.3


From 92ffe055c3ea45856183bebed62f8880f75fef3b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 16 Sep 2008 20:39:36 +0200
Subject: cfg80211: reject invalid configuration items

Reject configuring mesh-id for non-mesh, monitor flags for non-monitor.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/nl80211.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a745932d137..572793c8c7a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -418,7 +418,7 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	int err, ifindex;
 	enum nl80211_iftype type;
 	struct net_device *dev;
-	u32 flags;
+	u32 _flags, *flags = NULL;
 
 	memset(&params, 0, sizeof(params));
 
@@ -442,18 +442,28 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	if (type == NL80211_IFTYPE_MESH_POINT &&
-	    info->attrs[NL80211_ATTR_MESH_ID]) {
+	if (info->attrs[NL80211_ATTR_MESH_ID]) {
+		if (type != NL80211_IFTYPE_MESH_POINT) {
+			err = -EINVAL;
+			goto unlock;
+		}
 		params.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]);
 		params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
 	}
 
+	if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
+		if (type != NL80211_IFTYPE_MONITOR) {
+			err = -EINVAL;
+			goto unlock;
+		}
+		err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
+					  &_flags);
+		if (!err)
+			flags = &_flags;
+	}
 	rtnl_lock();
-	err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
-				  info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL,
-				  &flags);
 	err = drv->ops->change_virtual_intf(&drv->wiphy, ifindex,
-					    type, err ? NULL : &flags, &params);
+					    type, flags, &params);
 
 	dev = __dev_get_by_index(&init_net, ifindex);
 	WARN_ON(!dev || (!err && dev->ieee80211_ptr->iftype != type));
-- 
cgit v1.2.3


From 79617deeebb9cf089e2bc2aad19743b1209043f6 Mon Sep 17 00:00:00 2001
From: YanBo <dreamfly281@gmail.com>
Date: Mon, 22 Sep 2008 13:30:32 +0800
Subject: mac80211: mesh portal functionality support

Currently the mesh code doesn't support bridging mesh point interfaces
with wired ethernet or AP to construct an MPP or MAP. This patch adds
code to support the "6 address frame format packet" functionality to
mesh point interfaces. Now the mesh network can be used as backhaul
for end to end communication.

Signed-off-by: Li YanBo <dreamfly281@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mesh.h         |   4 ++
 net/mac80211/mesh_pathtbl.c | 127 +++++++++++++++++++++++++++++++++++++++++++-
 net/mac80211/rx.c           |  32 +++++++++--
 net/mac80211/tx.c           |  44 ++++++++++++---
 4 files changed, 196 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 8ee414a0447..e10471c6ba4 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -71,6 +71,7 @@ enum mesh_path_flags {
  */
 struct mesh_path {
 	u8 dst[ETH_ALEN];
+	u8 mpp[ETH_ALEN];	/* used for MPP or MAP */
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *next_hop;
 	struct timer_list timer;
@@ -226,6 +227,9 @@ int mesh_nexthop_lookup(struct sk_buff *skb,
 void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata);
 struct mesh_path *mesh_path_lookup(u8 *dst,
 		struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mpp_path_lookup(u8 *dst,
+				  struct ieee80211_sub_if_data *sdata);
+int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata);
 struct mesh_path *mesh_path_lookup_by_idx(int idx,
 		struct ieee80211_sub_if_data *sdata);
 void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop);
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index e4fa2905fad..3c72557df45 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -36,6 +36,7 @@ struct mpath_node {
 };
 
 static struct mesh_table *mesh_paths;
+static struct mesh_table *mpp_paths; /* Store paths for MPP&MAP */
 
 /* This lock will have the grow table function as writer and add / delete nodes
  * as readers. When reading the table (i.e. doing lookups) we are well protected
@@ -94,6 +95,34 @@ struct mesh_path *mesh_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata)
 	return NULL;
 }
 
+struct mesh_path *mpp_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_path *mpath;
+	struct hlist_node *n;
+	struct hlist_head *bucket;
+	struct mesh_table *tbl;
+	struct mpath_node *node;
+
+	tbl = rcu_dereference(mpp_paths);
+
+	bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)];
+	hlist_for_each_entry_rcu(node, n, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata &&
+		    memcmp(dst, mpath->dst, ETH_ALEN) == 0) {
+			if (MPATH_EXPIRED(mpath)) {
+				spin_lock_bh(&mpath->state_lock);
+				if (MPATH_EXPIRED(mpath))
+					mpath->flags &= ~MESH_PATH_ACTIVE;
+				spin_unlock_bh(&mpath->state_lock);
+			}
+			return mpath;
+		}
+	}
+	return NULL;
+}
+
+
 /**
  * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index
  * @idx: index
@@ -226,6 +255,91 @@ err_path_alloc:
 }
 
 
+int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_path *mpath, *new_mpath;
+	struct mpath_node *node, *new_node;
+	struct hlist_head *bucket;
+	struct hlist_node *n;
+	int grow = 0;
+	int err = 0;
+	u32 hash_idx;
+
+
+	if (memcmp(dst, sdata->dev->dev_addr, ETH_ALEN) == 0)
+		/* never add ourselves as neighbours */
+		return -ENOTSUPP;
+
+	if (is_multicast_ether_addr(dst))
+		return -ENOTSUPP;
+
+	err = -ENOMEM;
+	new_mpath = kzalloc(sizeof(struct mesh_path), GFP_KERNEL);
+	if (!new_mpath)
+		goto err_path_alloc;
+
+	new_node = kmalloc(sizeof(struct mpath_node), GFP_KERNEL);
+	if (!new_node)
+		goto err_node_alloc;
+
+	read_lock(&pathtbl_resize_lock);
+	memcpy(new_mpath->dst, dst, ETH_ALEN);
+	memcpy(new_mpath->mpp, mpp, ETH_ALEN);
+	new_mpath->sdata = sdata;
+	new_mpath->flags = 0;
+	skb_queue_head_init(&new_mpath->frame_queue);
+	new_node->mpath = new_mpath;
+	new_mpath->exp_time = jiffies;
+	spin_lock_init(&new_mpath->state_lock);
+
+	hash_idx = mesh_table_hash(dst, sdata, mpp_paths);
+	bucket = &mpp_paths->hash_buckets[hash_idx];
+
+	spin_lock(&mpp_paths->hashwlock[hash_idx]);
+
+	err = -EEXIST;
+	hlist_for_each_entry(node, n, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata && memcmp(dst, mpath->dst, ETH_ALEN) == 0)
+			goto err_exists;
+	}
+
+	hlist_add_head_rcu(&new_node->list, bucket);
+	if (atomic_inc_return(&mpp_paths->entries) >=
+		mpp_paths->mean_chain_len * (mpp_paths->hash_mask + 1))
+		grow = 1;
+
+	spin_unlock(&mpp_paths->hashwlock[hash_idx]);
+	read_unlock(&pathtbl_resize_lock);
+	if (grow) {
+		struct mesh_table *oldtbl, *newtbl;
+
+		write_lock(&pathtbl_resize_lock);
+		oldtbl = mpp_paths;
+		newtbl = mesh_table_grow(mpp_paths);
+		if (!newtbl) {
+			write_unlock(&pathtbl_resize_lock);
+			return 0;
+		}
+		rcu_assign_pointer(mpp_paths, newtbl);
+		write_unlock(&pathtbl_resize_lock);
+
+		synchronize_rcu();
+		mesh_table_free(oldtbl, false);
+	}
+	return 0;
+
+err_exists:
+	spin_unlock(&mpp_paths->hashwlock[hash_idx]);
+	read_unlock(&pathtbl_resize_lock);
+	kfree(new_node);
+err_node_alloc:
+	kfree(new_mpath);
+err_path_alloc:
+	return err;
+}
+
+
 /**
  * mesh_plink_broken - deactivates paths and sends perr when a link breaks
  *
@@ -475,11 +589,21 @@ static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl)
 int mesh_pathtbl_init(void)
 {
 	mesh_paths = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+	if (!mesh_paths)
+		return -ENOMEM;
 	mesh_paths->free_node = &mesh_path_node_free;
 	mesh_paths->copy_node = &mesh_path_node_copy;
 	mesh_paths->mean_chain_len = MEAN_CHAIN_LEN;
-	if (!mesh_paths)
+
+	mpp_paths = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+	if (!mpp_paths) {
+		mesh_table_free(mesh_paths, true);
 		return -ENOMEM;
+	}
+	mpp_paths->free_node = &mesh_path_node_free;
+	mpp_paths->copy_node = &mesh_path_node_copy;
+	mpp_paths->mean_chain_len = MEAN_CHAIN_LEN;
+
 	return 0;
 }
 
@@ -511,4 +635,5 @@ void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
 void mesh_pathtbl_unregister(void)
 {
 	mesh_table_free(mesh_paths, true);
+	mesh_table_free(mpp_paths, true);
 }
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3ab9670f180..2efa4dd47b5 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1107,10 +1107,6 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
 
-	if (ieee80211_vif_is_mesh(&sdata->vif))
-		hdrlen += ieee80211_get_mesh_hdrlen(
-				(struct ieee80211s_hdr *) (skb->data + hdrlen));
-
 	/* convert IEEE 802.11 header + possible LLC headers into Ethernet
 	 * header
 	 * IEEE 802.11 address fields:
@@ -1134,6 +1130,15 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 		if (unlikely(sdata->vif.type != NL80211_IFTYPE_WDS &&
 			     sdata->vif.type != NL80211_IFTYPE_MESH_POINT))
 			return -1;
+		if (ieee80211_vif_is_mesh(&sdata->vif)) {
+			struct ieee80211s_hdr *meshdr = (struct ieee80211s_hdr *)
+				(skb->data + hdrlen);
+			hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+			if (meshdr->flags & MESH_FLAGS_AE_A5_A6) {
+				memcpy(dst, meshdr->eaddr1, ETH_ALEN);
+				memcpy(src, meshdr->eaddr2, ETH_ALEN);
+			}
+		}
 		break;
 	case __constant_cpu_to_le16(IEEE80211_FCTL_FROMDS):
 		if (sdata->vif.type != NL80211_IFTYPE_STATION ||
@@ -1393,6 +1398,25 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 		/* illegal frame */
 		return RX_DROP_MONITOR;
 
+	if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6){
+		struct ieee80211_sub_if_data *sdata;
+		struct mesh_path *mppath;
+
+		sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
+		rcu_read_lock();
+		mppath = mpp_path_lookup(mesh_hdr->eaddr2, sdata);
+		if (!mppath) {
+			mpp_path_add(mesh_hdr->eaddr2, hdr->addr4, sdata);
+		} else {
+			spin_lock_bh(&mppath->state_lock);
+			mppath->exp_time = jiffies;
+			if (compare_ether_addr(mppath->mpp, hdr->addr4) != 0)
+				memcpy(mppath->mpp, hdr->addr4, ETH_ALEN);
+			spin_unlock_bh(&mppath->state_lock);
+		}
+		rcu_read_unlock();
+	}
+
 	if (compare_ether_addr(rx->dev->dev_addr, hdr->addr3) == 0)
 		return RX_CONTINUE;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 00d798cc9e0..00d96e63dce 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1498,18 +1498,50 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 #ifdef CONFIG_MAC80211_MESH
 	case NL80211_IFTYPE_MESH_POINT:
 		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
-		/* RA TA DA SA */
-		memset(hdr.addr1, 0, ETH_ALEN);
-		memcpy(hdr.addr2, dev->dev_addr, ETH_ALEN);
-		memcpy(hdr.addr3, skb->data, ETH_ALEN);
-		memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
 		if (!sdata->u.mesh.mshcfg.dot11MeshTTL) {
 			/* Do not send frames with mesh_ttl == 0 */
 			sdata->u.mesh.mshstats.dropped_frames_ttl++;
 			ret = 0;
 			goto fail;
 		}
-		meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata);
+		memset(&mesh_hdr, 0, sizeof(mesh_hdr));
+
+		if (compare_ether_addr(dev->dev_addr,
+					  skb->data + ETH_ALEN) == 0) {
+			/* RA TA DA SA */
+			memset(hdr.addr1, 0, ETH_ALEN);
+			memcpy(hdr.addr2, dev->dev_addr, ETH_ALEN);
+			memcpy(hdr.addr3, skb->data, ETH_ALEN);
+			memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+			meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata);
+		} else {
+			/* packet from other interface */
+			struct mesh_path *mppath;
+
+			memset(hdr.addr1, 0, ETH_ALEN);
+			memcpy(hdr.addr2, dev->dev_addr, ETH_ALEN);
+			memcpy(hdr.addr4, dev->dev_addr, ETH_ALEN);
+
+			if (is_multicast_ether_addr(skb->data))
+				memcpy(hdr.addr3, skb->data, ETH_ALEN);
+			else {
+				rcu_read_lock();
+				mppath = mpp_path_lookup(skb->data, sdata);
+				if (mppath)
+					memcpy(hdr.addr3, mppath->mpp, ETH_ALEN);
+				else
+					memset(hdr.addr3, 0xff, ETH_ALEN);
+				rcu_read_unlock();
+			}
+
+			mesh_hdr.flags |= MESH_FLAGS_AE_A5_A6;
+			mesh_hdr.ttl = sdata->u.mesh.mshcfg.dot11MeshTTL;
+			put_unaligned(cpu_to_le32(sdata->u.mesh.mesh_seqnum), &mesh_hdr.seqnum);
+			memcpy(mesh_hdr.eaddr1, skb->data, ETH_ALEN);
+			memcpy(mesh_hdr.eaddr2, skb->data + ETH_ALEN, ETH_ALEN);
+			sdata->u.mesh.mesh_seqnum++;
+			meshhdrlen = 18;
+		}
 		hdrlen = 30;
 		break;
 #endif
-- 
cgit v1.2.3


From 2ff6a6d4e92270283432690adf53a7e5ab186d19 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 18 Sep 2008 12:24:20 +0200
Subject: mac80211: fix mesh action frame handling

When I split off the action frame handling I made the code drop
all action frames we don't want to handle. This is wrong since
some action frames are actually handled via rx_h_mgmt through
being queued to the sta/mesh implementations.

Thanks to Li YanBo for noticing the problem.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Li YanBo <dreamfly281@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 2efa4dd47b5..c489865761b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1557,7 +1557,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 	 */
 	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
 	    sdata->vif.type != NL80211_IFTYPE_ADHOC)
-		return RX_DROP_MONITOR;
+		return RX_CONTINUE;
 
 	switch (mgmt->u.action.category) {
 	case WLAN_CATEGORY_BACK:
-- 
cgit v1.2.3


From 4b7679a561e552eeda1e3567119bef2bca99b66e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 18 Sep 2008 18:14:18 +0200
Subject: mac80211: clean up rate control API

Long awaited, hard work. This patch totally cleans up the rate control
API to remove the requirement to include internal headers outside of
net/mac80211/.

There's one internal use in the PID algorithm left for mesh networking,
we'll have to figure out a way to clean that one up and decide how to
do the peer link evaluation, possibly independent of the rate control
algorithm or via new API.

Additionally, ath9k is left using the cross-inclusion hack for now, we
will add new API where necessary to make this work properly, but right
now I'm not expert enough to do it. It's still off better than before.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c              |   2 +-
 net/mac80211/ieee80211_i.h      |   2 +
 net/mac80211/main.c             |   4 +-
 net/mac80211/mlme.c             |   4 +-
 net/mac80211/rate.c             |  71 ++++++++++++++----
 net/mac80211/rate.h             | 102 +++++++-------------------
 net/mac80211/rc80211_pid.h      |   2 -
 net/mac80211/rc80211_pid_algo.c | 158 +++++++++++++++-------------------------
 net/mac80211/sta_info.c         |  17 ++---
 net/mac80211/sta_info.h         |   2 +-
 net/mac80211/tx.c               |   5 +-
 11 files changed, 159 insertions(+), 210 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 89a183c2327..855126a3039 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -693,7 +693,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 
 	sta_apply_parameters(local, sta, params);
 
-	rate_control_rate_init(sta, local);
+	rate_control_rate_init(sta);
 
 	rcu_read_lock();
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 0b25b0f46b1..8025b294588 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -724,6 +724,8 @@ struct ieee80211_local {
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	struct local_debugfsdentries {
+		struct dentry *rcdir;
+		struct dentry *rcname;
 		struct dentry *frequency;
 		struct dentry *antenna_sel_tx;
 		struct dentry *antenna_sel_rx;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 7d2d5a041e2..d608c44047c 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -542,6 +542,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	u16 frag, type;
 	__le16 fc;
+	struct ieee80211_supported_band *sband;
 	struct ieee80211_tx_status_rtap_hdr *rthdr;
 	struct ieee80211_sub_if_data *sdata;
 	struct net_device *prev_dev = NULL;
@@ -588,7 +589,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 			sta->tx_retry_count += info->status.retry_count;
 		}
 
-		rate_control_tx_status(local->mdev, skb);
+		sband = local->hw.wiphy->bands[info->band];
+		rate_control_tx_status(local, sband, sta, skb);
 	}
 
 	rcu_read_unlock();
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8611a8318c9..109c3a7e63a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1323,7 +1323,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		ieee80211_handle_ht(local, 1, &sta->sta.ht_info, &bss_info);
 	}
 
-	rate_control_rate_init(sta, local);
+	rate_control_rate_init(sta);
 
 	if (elems.wmm_param) {
 		set_sta_flags(sta, WLAN_STA_WME);
@@ -2342,7 +2342,7 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 	sta->sta.supp_rates[band] = supp_rates |
 			ieee80211_mandatory_rates(local, band);
 
-	rate_control_rate_init(sta, local);
+	rate_control_rate_init(sta);
 
 	if (sta_info_insert(sta))
 		return NULL;
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 0388c090dfe..5d786720d93 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -12,6 +12,7 @@
 #include <linux/rtnetlink.h>
 #include "rate.h"
 #include "ieee80211_i.h"
+#include "debugfs.h"
 
 struct rate_control_alg {
 	struct list_head list;
@@ -127,19 +128,46 @@ static void ieee80211_rate_control_ops_put(struct rate_control_ops *ops)
 	module_put(ops->module);
 }
 
+#ifdef CONFIG_MAC80211_DEBUGFS
+static ssize_t rcname_read(struct file *file, char __user *userbuf,
+			   size_t count, loff_t *ppos)
+{
+	struct rate_control_ref *ref = file->private_data;
+	int len = strlen(ref->ops->name);
+
+	return simple_read_from_buffer(userbuf, count, ppos,
+				       ref->ops->name, len);
+}
+
+static const struct file_operations rcname_ops = {
+	.read = rcname_read,
+	.open = mac80211_open_file_generic,
+};
+#endif
+
 struct rate_control_ref *rate_control_alloc(const char *name,
 					    struct ieee80211_local *local)
 {
+	struct dentry *debugfsdir = NULL;
 	struct rate_control_ref *ref;
 
 	ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL);
 	if (!ref)
 		goto fail_ref;
 	kref_init(&ref->kref);
+	ref->local = local;
 	ref->ops = ieee80211_rate_control_ops_get(name);
 	if (!ref->ops)
 		goto fail_ops;
-	ref->priv = ref->ops->alloc(local);
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir);
+	local->debugfs.rcdir = debugfsdir;
+	local->debugfs.rcname = debugfs_create_file("name", 0400, debugfsdir,
+						    ref, &rcname_ops);
+#endif
+
+	ref->priv = ref->ops->alloc(&local->hw, debugfsdir);
 	if (!ref->priv)
 		goto fail_priv;
 	return ref;
@@ -158,29 +186,46 @@ static void rate_control_release(struct kref *kref)
 
 	ctrl_ref = container_of(kref, struct rate_control_ref, kref);
 	ctrl_ref->ops->free(ctrl_ref->priv);
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	debugfs_remove(ctrl_ref->local->debugfs.rcname);
+	ctrl_ref->local->debugfs.rcname = NULL;
+	debugfs_remove(ctrl_ref->local->debugfs.rcdir);
+	ctrl_ref->local->debugfs.rcdir = NULL;
+#endif
+
 	ieee80211_rate_control_ops_put(ctrl_ref->ops);
 	kfree(ctrl_ref);
 }
 
-void rate_control_get_rate(struct net_device *dev,
+void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
 			   struct ieee80211_supported_band *sband,
-			   struct sk_buff *skb,
+			   struct sta_info *sta, struct sk_buff *skb,
 			   struct rate_selection *sel)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct rate_control_ref *ref = local->rate_ctrl;
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct sta_info *sta;
+	struct rate_control_ref *ref = sdata->local->rate_ctrl;
+	void *priv_sta = NULL;
+	struct ieee80211_sta *ista = NULL;
 	int i;
 
-	rcu_read_lock();
-	sta = sta_info_get(local, hdr->addr1);
-
 	sel->rate_idx = -1;
 	sel->nonerp_idx = -1;
 	sel->probe_idx = -1;
+	sel->max_rate_idx = sdata->max_ratectrl_rateidx;
+
+	if (sta) {
+		ista = &sta->sta;
+		priv_sta = sta->rate_ctrl_priv;
+	}
+
+	if (sta && sdata->force_unicast_rateidx > -1)
+		sel->rate_idx = sdata->force_unicast_rateidx;
+	else
+		ref->ops->get_rate(ref->priv, sband, ista, priv_sta, skb, sel);
 
-	ref->ops->get_rate(ref->priv, dev, sband, skb, sel);
+	if (sdata->max_ratectrl_rateidx > -1 &&
+	    sel->rate_idx > sdata->max_ratectrl_rateidx)
+		sel->rate_idx = sdata->max_ratectrl_rateidx;
 
 	BUG_ON(sel->rate_idx < 0);
 
@@ -191,13 +236,11 @@ void rate_control_get_rate(struct net_device *dev,
 			if (sband->bitrates[sel->rate_idx].bitrate < rate->bitrate)
 				break;
 
-			if (rate_supported(sta, sband->band, i) &&
+			if (rate_supported(ista, sband->band, i) &&
 			    !(rate->flags & IEEE80211_RATE_ERP_G))
 				sel->nonerp_idx = i;
 		}
 	}
-
-	rcu_read_unlock();
 }
 
 struct rate_control_ref *rate_control_get(struct rate_control_ref *ref)
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 5f18c27eb90..eb94e584d24 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -19,77 +19,48 @@
 #include "ieee80211_i.h"
 #include "sta_info.h"
 
-/**
- * struct rate_selection - rate selection for rate control algos
- * @rate: selected transmission rate index
- * @nonerp: Non-ERP rate to use instead if ERP cannot be used
- * @probe: rate for probing (or -1)
- *
- */
-struct rate_selection {
-	s8 rate_idx, nonerp_idx, probe_idx;
-};
-
-struct rate_control_ops {
-	struct module *module;
-	const char *name;
-	void (*tx_status)(void *priv, struct net_device *dev,
-			  struct sk_buff *skb);
-	void (*get_rate)(void *priv, struct net_device *dev,
-			 struct ieee80211_supported_band *band,
-			 struct sk_buff *skb,
-			 struct rate_selection *sel);
-	void (*rate_init)(void *priv, void *priv_sta,
-			  struct ieee80211_local *local, struct sta_info *sta);
-	void (*clear)(void *priv);
-
-	void *(*alloc)(struct ieee80211_local *local);
-	void (*free)(void *priv);
-	void *(*alloc_sta)(void *priv, gfp_t gfp);
-	void (*free_sta)(void *priv, void *priv_sta);
-
-	int (*add_attrs)(void *priv, struct kobject *kobj);
-	void (*remove_attrs)(void *priv, struct kobject *kobj);
-	void (*add_sta_debugfs)(void *priv, void *priv_sta,
-				struct dentry *dir);
-	void (*remove_sta_debugfs)(void *priv, void *priv_sta);
-};
-
 struct rate_control_ref {
+	struct ieee80211_local *local;
 	struct rate_control_ops *ops;
 	void *priv;
 	struct kref kref;
 };
 
-int ieee80211_rate_control_register(struct rate_control_ops *ops);
-void ieee80211_rate_control_unregister(struct rate_control_ops *ops);
-
 /* Get a reference to the rate control algorithm. If `name' is NULL, get the
  * first available algorithm. */
 struct rate_control_ref *rate_control_alloc(const char *name,
 					    struct ieee80211_local *local);
-void rate_control_get_rate(struct net_device *dev,
+void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
 			   struct ieee80211_supported_band *sband,
-			   struct sk_buff *skb,
+			   struct sta_info *sta, struct sk_buff *skb,
 			   struct rate_selection *sel);
 struct rate_control_ref *rate_control_get(struct rate_control_ref *ref);
 void rate_control_put(struct rate_control_ref *ref);
 
-static inline void rate_control_tx_status(struct net_device *dev,
+static inline void rate_control_tx_status(struct ieee80211_local *local,
+					  struct ieee80211_supported_band *sband,
+					  struct sta_info *sta,
 					  struct sk_buff *skb)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct rate_control_ref *ref = local->rate_ctrl;
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;
 
-	ref->ops->tx_status(ref->priv, dev, skb);
+	ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb);
 }
 
 
-static inline void rate_control_rate_init(struct sta_info *sta,
-					  struct ieee80211_local *local)
+static inline void rate_control_rate_init(struct sta_info *sta)
 {
+	struct ieee80211_local *local = sta->sdata->local;
 	struct rate_control_ref *ref = sta->rate_ctrl;
-	ref->ops->rate_init(ref->priv, sta->rate_ctrl_priv, local, sta);
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;
+	struct ieee80211_supported_band *sband;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	ref->ops->rate_init(ref->priv, sband, ista, priv_sta);
 }
 
 
@@ -100,15 +71,19 @@ static inline void rate_control_clear(struct ieee80211_local *local)
 }
 
 static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
+					   struct ieee80211_sta *sta,
 					   gfp_t gfp)
 {
-	return ref->ops->alloc_sta(ref->priv, gfp);
+	return ref->ops->alloc_sta(ref->priv, sta, gfp);
 }
 
-static inline void rate_control_free_sta(struct rate_control_ref *ref,
-					 void *priv)
+static inline void rate_control_free_sta(struct sta_info *sta)
 {
-	ref->ops->free_sta(ref->priv, priv);
+	struct rate_control_ref *ref = sta->rate_ctrl;
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;
+
+	ref->ops->free_sta(ref->priv, ista, priv_sta);
 }
 
 static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
@@ -130,31 +105,6 @@ static inline void rate_control_remove_sta_debugfs(struct sta_info *sta)
 #endif
 }
 
-static inline int rate_supported(struct sta_info *sta,
-				 enum ieee80211_band band,
-				 int index)
-{
-	return (sta == NULL || sta->sta.supp_rates[band] & BIT(index));
-}
-
-static inline s8
-rate_lowest_index(struct ieee80211_local *local,
-		  struct ieee80211_supported_band *sband,
-		  struct sta_info *sta)
-{
-	int i;
-
-	for (i = 0; i < sband->n_bitrates; i++)
-		if (rate_supported(sta, sband->band, i))
-			return i;
-
-	/* warn when we cannot find a rate. */
-	WARN_ON(1);
-
-	return 0;
-}
-
-
 /* functions for rate control related to a device */
 int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
 				 const char *name);
diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h
index ffafc5da572..01d64d53f3b 100644
--- a/net/mac80211/rc80211_pid.h
+++ b/net/mac80211/rc80211_pid.h
@@ -124,7 +124,6 @@ struct rc_pid_events_file_info {
  * struct rc_pid_debugfs_entries - tunable parameters
  *
  * Algorithm parameters, tunable via debugfs.
- * @dir: the debugfs directory for a specific phy
  * @target: target percentage for failed frames
  * @sampling_period: error sampling interval in milliseconds
  * @coeff_p: absolute value of the proportional coefficient
@@ -143,7 +142,6 @@ struct rc_pid_events_file_info {
  *	ordering of rates)
  */
 struct rc_pid_debugfs_entries {
-	struct dentry *dir;
 	struct dentry *target;
 	struct dentry *sampling_period;
 	struct dentry *coeff_p;
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index bc1c4569caa..86eb374e3b8 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -68,18 +68,14 @@
  * exhibited a worse failed frames behaviour and we'll choose the highest rate
  * whose failed frames behaviour is not worse than the one of the original rate
  * target. While at it, check that the new rate is valid. */
-static void rate_control_pid_adjust_rate(struct ieee80211_local *local,
-					 struct sta_info *sta, int adj,
+static void rate_control_pid_adjust_rate(struct ieee80211_supported_band *sband,
+					 struct ieee80211_sta *sta,
+					 struct rc_pid_sta_info *spinfo, int adj,
 					 struct rc_pid_rateinfo *rinfo)
 {
-	struct ieee80211_sub_if_data *sdata;
-	struct ieee80211_supported_band *sband;
 	int cur_sorted, new_sorted, probe, tmp, n_bitrates, band;
-	struct rc_pid_sta_info *spinfo = (void *)sta->rate_ctrl_priv;
 	int cur = spinfo->txrate_idx;
 
-	sdata = sta->sdata;
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 	band = sband->band;
 	n_bitrates = sband->n_bitrates;
 
@@ -146,13 +142,11 @@ static void rate_control_pid_normalize(struct rc_pid_info *pinfo, int l)
 }
 
 static void rate_control_pid_sample(struct rc_pid_info *pinfo,
-				    struct ieee80211_local *local,
-				    struct sta_info *sta)
+				    struct ieee80211_supported_band *sband,
+				    struct ieee80211_sta *sta,
+				    struct rc_pid_sta_info *spinfo)
 {
-	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	struct rc_pid_sta_info *spinfo = sta->rate_ctrl_priv;
 	struct rc_pid_rateinfo *rinfo = pinfo->rinfo;
-	struct ieee80211_supported_band *sband;
 	u32 pf;
 	s32 err_avg;
 	u32 err_prop;
@@ -161,9 +155,6 @@ static void rate_control_pid_sample(struct rc_pid_info *pinfo,
 	int adj, i, j, tmp;
 	unsigned long period;
 
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-	spinfo = sta->rate_ctrl_priv;
-
 	/* In case nothing happened during the previous control interval, turn
 	 * the sharpening factor on. */
 	period = (HZ * pinfo->sampling_period + 500) / 1000;
@@ -179,11 +170,15 @@ static void rate_control_pid_sample(struct rc_pid_info *pinfo,
 	if (unlikely(spinfo->tx_num_xmit == 0))
 		pf = spinfo->last_pf;
 	else {
+		/* XXX: BAD HACK!!! */
+		struct sta_info *si = container_of(sta, struct sta_info, sta);
+
 		pf = spinfo->tx_num_failed * 100 / spinfo->tx_num_xmit;
-		if (ieee80211_vif_is_mesh(&sdata->vif) && pf == 100)
-			mesh_plink_broken(sta);
+
+		if (ieee80211_vif_is_mesh(&si->sdata->vif) && pf == 100)
+			mesh_plink_broken(si);
 		pf <<= RC_PID_ARITH_SHIFT;
-		sta->fail_avg = ((pf + (spinfo->last_pf << 3)) / 9)
+		si->fail_avg = ((pf + (spinfo->last_pf << 3)) / 9)
 					>> RC_PID_ARITH_SHIFT;
 	}
 
@@ -229,43 +224,25 @@ static void rate_control_pid_sample(struct rc_pid_info *pinfo,
 
 	/* Change rate. */
 	if (adj)
-		rate_control_pid_adjust_rate(local, sta, adj, rinfo);
+		rate_control_pid_adjust_rate(sband, sta, spinfo, adj, rinfo);
 }
 
-static void rate_control_pid_tx_status(void *priv, struct net_device *dev,
+static void rate_control_pid_tx_status(void *priv, struct ieee80211_supported_band *sband,
+				       struct ieee80211_sta *sta, void *priv_sta,
 				       struct sk_buff *skb)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct ieee80211_sub_if_data *sdata;
 	struct rc_pid_info *pinfo = priv;
-	struct sta_info *sta;
-	struct rc_pid_sta_info *spinfo;
+	struct rc_pid_sta_info *spinfo = priv_sta;
 	unsigned long period;
-	struct ieee80211_supported_band *sband;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
-	rcu_read_lock();
-
-	sta = sta_info_get(local, hdr->addr1);
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
-	if (!sta)
-		goto unlock;
-
-	spinfo = sta->rate_ctrl_priv;
-
-	/* Don't update the state if we're not controlling the rate. */
-	sdata = sta->sdata;
-	if (sdata->force_unicast_rateidx > -1) {
-		spinfo->txrate_idx = sdata->max_ratectrl_rateidx;
-		goto unlock;
-	}
+	if (!spinfo)
+		return;
 
 	/* Ignore all frames that were sent with a different rate than the rate
 	 * we currently advise mac80211 to use. */
 	if (info->tx_rate_idx != spinfo->txrate_idx)
-		goto unlock;
+		return;
 
 	spinfo->tx_num_xmit++;
 
@@ -289,78 +266,63 @@ static void rate_control_pid_tx_status(void *priv, struct net_device *dev,
 	if (!period)
 		period = 1;
 	if (time_after(jiffies, spinfo->last_sample + period))
-		rate_control_pid_sample(pinfo, local, sta);
-
- unlock:
-	rcu_read_unlock();
+		rate_control_pid_sample(pinfo, sband, sta, spinfo);
 }
 
-static void rate_control_pid_get_rate(void *priv, struct net_device *dev,
-				      struct ieee80211_supported_band *sband,
-				      struct sk_buff *skb,
-				      struct rate_selection *sel)
+static void
+rate_control_pid_get_rate(void *priv, struct ieee80211_supported_band *sband,
+			  struct ieee80211_sta *sta, void *priv_sta,
+			  struct sk_buff *skb,
+			  struct rate_selection *sel)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct ieee80211_sub_if_data *sdata;
-	struct rc_pid_sta_info *spinfo;
-	struct sta_info *sta;
+	struct rc_pid_sta_info *spinfo = priv_sta;
 	int rateidx;
 	u16 fc;
 
-	rcu_read_lock();
-
-	sta = sta_info_get(local, hdr->addr1);
-
 	/* Send management frames and broadcast/multicast data using lowest
 	 * rate. */
 	fc = le16_to_cpu(hdr->frame_control);
-	if ((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA ||
-	    is_multicast_ether_addr(hdr->addr1) || !sta) {
-		sel->rate_idx = rate_lowest_index(local, sband, sta);
-		rcu_read_unlock();
+	if (!sta || !spinfo ||
+	    (fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA ||
+	    is_multicast_ether_addr(hdr->addr1)) {
+		sel->rate_idx = rate_lowest_index(sband, sta);
 		return;
 	}
 
-	/* If a forced rate is in effect, select it. */
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	spinfo = (struct rc_pid_sta_info *)sta->rate_ctrl_priv;
-	if (sdata->force_unicast_rateidx > -1)
-		spinfo->txrate_idx = sdata->force_unicast_rateidx;
-
 	rateidx = spinfo->txrate_idx;
 
 	if (rateidx >= sband->n_bitrates)
 		rateidx = sband->n_bitrates - 1;
 
-	rcu_read_unlock();
-
 	sel->rate_idx = rateidx;
 
 #ifdef CONFIG_MAC80211_DEBUGFS
-	rate_control_pid_event_tx_rate(
-		&((struct rc_pid_sta_info *) sta->rate_ctrl_priv)->events,
+	rate_control_pid_event_tx_rate(&spinfo->events,
 		rateidx, sband->bitrates[rateidx].bitrate);
 #endif
 }
 
-static void rate_control_pid_rate_init(void *priv, void *priv_sta,
-					  struct ieee80211_local *local,
-					  struct sta_info *sta)
+static void
+rate_control_pid_rate_init(void *priv, struct ieee80211_supported_band *sband,
+			   struct ieee80211_sta *sta, void *priv_sta)
 {
+	struct rc_pid_sta_info *spinfo = priv_sta;
+	struct sta_info *si;
+
 	/* TODO: This routine should consider using RSSI from previous packets
 	 * as we need to have IEEE 802.1X auth succeed immediately after assoc..
 	 * Until that method is implemented, we will use the lowest supported
 	 * rate as a workaround. */
-	struct ieee80211_supported_band *sband;
-	struct rc_pid_sta_info *spinfo = (void *)sta->rate_ctrl_priv;
 
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-	spinfo->txrate_idx = rate_lowest_index(local, sband, sta);
-	sta->fail_avg = 0;
+	spinfo->txrate_idx = rate_lowest_index(sband, sta);
+	/* HACK */
+	si = container_of(sta, struct sta_info, sta);
+	si->fail_avg = 0;
 }
 
-static void *rate_control_pid_alloc(struct ieee80211_local *local)
+static void *rate_control_pid_alloc(struct ieee80211_hw *hw,
+				    struct dentry *debugfsdir)
 {
 	struct rc_pid_info *pinfo;
 	struct rc_pid_rateinfo *rinfo;
@@ -371,7 +333,7 @@ static void *rate_control_pid_alloc(struct ieee80211_local *local)
 	struct rc_pid_debugfs_entries *de;
 #endif
 
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	sband = hw->wiphy->bands[hw->conf.channel->band];
 
 	pinfo = kmalloc(sizeof(*pinfo), GFP_ATOMIC);
 	if (!pinfo)
@@ -426,30 +388,28 @@ static void *rate_control_pid_alloc(struct ieee80211_local *local)
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	de = &pinfo->dentries;
-	de->dir = debugfs_create_dir("rc80211_pid",
-				     local->hw.wiphy->debugfsdir);
 	de->target = debugfs_create_u32("target_pf", S_IRUSR | S_IWUSR,
-					de->dir, &pinfo->target);
+					debugfsdir, &pinfo->target);
 	de->sampling_period = debugfs_create_u32("sampling_period",
-						 S_IRUSR | S_IWUSR, de->dir,
+						 S_IRUSR | S_IWUSR, debugfsdir,
 						 &pinfo->sampling_period);
 	de->coeff_p = debugfs_create_u32("coeff_p", S_IRUSR | S_IWUSR,
-					 de->dir, &pinfo->coeff_p);
+					 debugfsdir, &pinfo->coeff_p);
 	de->coeff_i = debugfs_create_u32("coeff_i", S_IRUSR | S_IWUSR,
-					 de->dir, &pinfo->coeff_i);
+					 debugfsdir, &pinfo->coeff_i);
 	de->coeff_d = debugfs_create_u32("coeff_d", S_IRUSR | S_IWUSR,
-					 de->dir, &pinfo->coeff_d);
+					 debugfsdir, &pinfo->coeff_d);
 	de->smoothing_shift = debugfs_create_u32("smoothing_shift",
-						 S_IRUSR | S_IWUSR, de->dir,
+						 S_IRUSR | S_IWUSR, debugfsdir,
 						 &pinfo->smoothing_shift);
 	de->sharpen_factor = debugfs_create_u32("sharpen_factor",
-					       S_IRUSR | S_IWUSR, de->dir,
+					       S_IRUSR | S_IWUSR, debugfsdir,
 					       &pinfo->sharpen_factor);
 	de->sharpen_duration = debugfs_create_u32("sharpen_duration",
-						  S_IRUSR | S_IWUSR, de->dir,
+						  S_IRUSR | S_IWUSR, debugfsdir,
 						  &pinfo->sharpen_duration);
 	de->norm_offset = debugfs_create_u32("norm_offset",
-					     S_IRUSR | S_IWUSR, de->dir,
+					     S_IRUSR | S_IWUSR, debugfsdir,
 					     &pinfo->norm_offset);
 #endif
 
@@ -471,7 +431,6 @@ static void rate_control_pid_free(void *priv)
 	debugfs_remove(de->coeff_p);
 	debugfs_remove(de->sampling_period);
 	debugfs_remove(de->target);
-	debugfs_remove(de->dir);
 #endif
 
 	kfree(pinfo->rinfo);
@@ -482,7 +441,8 @@ static void rate_control_pid_clear(void *priv)
 {
 }
 
-static void *rate_control_pid_alloc_sta(void *priv, gfp_t gfp)
+static void *rate_control_pid_alloc_sta(void *priv, struct ieee80211_sta *sta,
+					gfp_t gfp)
 {
 	struct rc_pid_sta_info *spinfo;
 
@@ -500,10 +460,10 @@ static void *rate_control_pid_alloc_sta(void *priv, gfp_t gfp)
 	return spinfo;
 }
 
-static void rate_control_pid_free_sta(void *priv, void *priv_sta)
+static void rate_control_pid_free_sta(void *priv, struct ieee80211_sta *sta,
+				      void *priv_sta)
 {
-	struct rc_pid_sta_info *spinfo = priv_sta;
-	kfree(spinfo);
+	kfree(priv_sta);
 }
 
 static struct rate_control_ops mac80211_rcpid = {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index d9774ac2e0f..9b72d15bc8d 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -93,8 +93,7 @@ static int sta_info_hash_del(struct ieee80211_local *local,
 }
 
 /* protected by RCU */
-static struct sta_info *__sta_info_find(struct ieee80211_local *local,
-					const u8 *addr)
+struct sta_info *sta_info_get(struct ieee80211_local *local, const u8 *addr)
 {
 	struct sta_info *sta;
 
@@ -107,12 +106,6 @@ static struct sta_info *__sta_info_find(struct ieee80211_local *local,
 	return sta;
 }
 
-struct sta_info *sta_info_get(struct ieee80211_local *local, u8 *addr)
-{
-	return __sta_info_find(local, addr);
-}
-EXPORT_SYMBOL(sta_info_get);
-
 struct sta_info *sta_info_get_by_idx(struct ieee80211_local *local, int idx,
 				     struct net_device *dev)
 {
@@ -146,7 +139,7 @@ static void __sta_info_free(struct ieee80211_local *local,
 {
 	DECLARE_MAC_BUF(mbuf);
 
-	rate_control_free_sta(sta->rate_ctrl, sta->rate_ctrl_priv);
+	rate_control_free_sta(sta);
 	rate_control_put(sta->rate_ctrl);
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
@@ -244,7 +237,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 
 	sta->rate_ctrl = rate_control_get(local->rate_ctrl);
 	sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
-						     gfp);
+						     &sta->sta, gfp);
 	if (!sta->rate_ctrl_priv) {
 		rate_control_put(sta->rate_ctrl);
 		kfree(sta);
@@ -308,7 +301,7 @@ int sta_info_insert(struct sta_info *sta)
 
 	spin_lock_irqsave(&local->sta_lock, flags);
 	/* check if STA exists already */
-	if (__sta_info_find(local, sta->sta.addr)) {
+	if (sta_info_get(local, sta->sta.addr)) {
 		spin_unlock_irqrestore(&local->sta_lock, flags);
 		err = -EEXIST;
 		goto out_free;
@@ -834,7 +827,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
 struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_hw *hw,
                                          const u8 *addr)
 {
-	struct sta_info *sta = __sta_info_find(hw_to_local(hw), addr);
+	struct sta_info *sta = sta_info_get(hw_to_local(hw), addr);
 
 	if (!sta)
 		return NULL;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index daedfa9e1c6..c3f43696462 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -416,7 +416,7 @@ static inline u32 get_sta_flags(struct sta_info *sta)
 /*
  * Get a STA info, must have be under RCU read lock.
  */
-struct sta_info *sta_info_get(struct ieee80211_local *local, u8 *addr);
+struct sta_info *sta_info_get(struct ieee80211_local *local, const u8 *addr);
 /*
  * Get STA info by index, BROKEN!
  */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 00d96e63dce..0cc2e23f082 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -446,7 +446,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
 	sband = tx->local->hw.wiphy->bands[tx->channel->band];
 
 	if (likely(tx->rate_idx < 0)) {
-		rate_control_get_rate(tx->dev, sband, tx->skb, &rsel);
+		rate_control_get_rate(tx->sdata, sband, tx->sta,
+				      tx->skb, &rsel);
 		if (tx->sta)
 			tx->sta->last_txrate_idx = rsel.rate_idx;
 		tx->rate_idx = rsel.rate_idx;
@@ -1955,7 +1956,7 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	skb->do_not_encrypt = 1;
 
 	info->band = band;
-	rate_control_get_rate(local->mdev, sband, skb, &rsel);
+	rate_control_get_rate(sdata, sband, NULL, skb, &rsel);
 
 	if (unlikely(rsel.rate_idx < 0)) {
 		if (net_ratelimit()) {
-- 
cgit v1.2.3


From 4dfe51e10047a60e82734860cec0d9f660b102fc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 19 Sep 2008 05:10:34 +0200
Subject: mac80211: probe with correct SSID

While associated, we should probe with the SSID we're associated to,
not the scan SSID.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 109c3a7e63a..52a64813301 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -942,8 +942,8 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
 				disassoc = 1;
 			} else
 				ieee80211_send_probe_req(sdata, ifsta->bssid,
-							 local->scan_ssid,
-							 local->scan_ssid_len);
+							 ifsta->ssid,
+							 ifsta->ssid_len);
 			ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL;
 		} else {
 			ifsta->flags &= ~IEEE80211_STA_PROBEREQ_POLL;
-- 
cgit v1.2.3


From 4492bea656b70dad6a9ae7b59b1430fa38ba3345 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Mon, 22 Sep 2008 17:10:10 +0300
Subject: mac80211: fix led behavior in IBSS

This patch fixes the led behavior in IBSS. After we joined an IBSS cell we
need to inform the led that we got associated. Although there is no 802.11
association in IBSS mode, the semantic of "There is a link" is relevant.
This allows the led to blink in IBSS mode (at least this solves a bug for
iwlwifi).

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Reviewed-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 52a64813301..e859a0ab616 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1452,6 +1452,8 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED;
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
 
+	ieee80211_led_assoc(local, true);
+
 	memset(&wrqu, 0, sizeof(wrqu));
 	memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN);
 	wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
-- 
cgit v1.2.3


From 638af07386972861272ed9d0cff01cad528fdceb Mon Sep 17 00:00:00 2001
From: Denis ChengRq <crquan@gmail.com>
Date: Tue, 23 Sep 2008 02:35:37 +0800
Subject: wireless: a global static to local static improvement

There are two improvements in this simple patch:
1. wiphy_counter is a static var only used in one function, so
   can use local static instead of global static;
2. wiphy_counter wrap handling killed one comparision;

Signed-off-by: Denis ChengRq <crquan@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/core.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index d6940085d59..5cadbeb76a1 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -34,7 +34,6 @@ MODULE_DESCRIPTION("wireless configuration support");
  * often because we need to do it for each command */
 LIST_HEAD(cfg80211_drv_list);
 DEFINE_MUTEX(cfg80211_drv_mutex);
-static int wiphy_counter;
 
 /* for debugfs */
 static struct dentry *ieee80211_debugfs_dir;
@@ -206,6 +205,8 @@ out_unlock:
 
 struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv)
 {
+	static int wiphy_counter;
+
 	struct cfg80211_registered_device *drv;
 	int alloc_size;
 
@@ -222,21 +223,18 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv)
 
 	mutex_lock(&cfg80211_drv_mutex);
 
-	drv->idx = wiphy_counter;
-
-	/* now increase counter for the next device unless
-	 * it has wrapped previously */
-	if (wiphy_counter >= 0)
-		wiphy_counter++;
-
-	mutex_unlock(&cfg80211_drv_mutex);
+	drv->idx = wiphy_counter++;
 
 	if (unlikely(drv->idx < 0)) {
+		wiphy_counter--;
+		mutex_unlock(&cfg80211_drv_mutex);
 		/* ugh, wrapped! */
 		kfree(drv);
 		return NULL;
 	}
 
+	mutex_unlock(&cfg80211_drv_mutex);
+
 	/* give it a proper name */
 	snprintf(drv->wiphy.dev.bus_id, BUS_ID_SIZE,
 		 PHY_NAME "%d", drv->idx);
-- 
cgit v1.2.3


From e851db5b05408b89b9a9429a66814b79fabee2a1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 30 Jun 2008 18:45:30 -0400
Subject: SUNRPC: Add address family field to svc_serv data structure

Introduce and initialize an address family field in the svc_serv structure.

This field will determine what family to use for the service's listener
sockets and what families are advertised via the local rpcbind daemon.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5a32cb7c4bb..9ba17044109 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -357,7 +357,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
  */
 static struct svc_serv *
 __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
-	   void (*shutdown)(struct svc_serv *serv))
+	   sa_family_t family, void (*shutdown)(struct svc_serv *serv))
 {
 	struct svc_serv	*serv;
 	unsigned int vers;
@@ -366,6 +366,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 
 	if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
 		return NULL;
+	serv->sv_family    = family;
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
@@ -425,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 
 struct svc_serv *
 svc_create(struct svc_program *prog, unsigned int bufsize,
-		void (*shutdown)(struct svc_serv *serv))
+		sa_family_t family, void (*shutdown)(struct svc_serv *serv))
 {
-	return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+	return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
 }
 EXPORT_SYMBOL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
-		void (*shutdown)(struct svc_serv *serv),
+		  sa_family_t family, void (*shutdown)(struct svc_serv *serv),
 		  svc_thread_fn func, struct module *mod)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
-	serv = __svc_create(prog, bufsize, npools, shutdown);
+	serv = __svc_create(prog, bufsize, npools, family, shutdown);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
-- 
cgit v1.2.3


From 5dd248f6f1ffe1f691fd66749e2a3dc8f8eb7b5e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 30 Jun 2008 18:45:37 -0400
Subject: SUNRPC: Use proper INADDR_ANY when setting up RPC services on IPv6

Teach svc_create_xprt() to use the correct ANY address for AF_INET6 based
RPC services.

No caller uses AF_INET6 yet.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc_xprt.c | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e46c825f495..bf5b5cdafeb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
 }
 EXPORT_SYMBOL_GPL(svc_xprt_init);
 
-int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
-		    int flags)
+static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
+					 struct svc_serv *serv,
+					 unsigned short port, int flags)
 {
-	struct svc_xprt_class *xcl;
 	struct sockaddr_in sin = {
 		.sin_family		= AF_INET,
 		.sin_addr.s_addr	= htonl(INADDR_ANY),
 		.sin_port		= htons(port),
 	};
+	struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+		.sin6_port		= htons(port),
+	};
+	struct sockaddr *sap;
+	size_t len;
+
+	switch (serv->sv_family) {
+	case AF_INET:
+		sap = (struct sockaddr *)&sin;
+		len = sizeof(sin);
+		break;
+	case AF_INET6:
+		sap = (struct sockaddr *)&sin6;
+		len = sizeof(sin6);
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
+	return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
+}
+
+int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+		    int flags)
+{
+	struct svc_xprt_class *xcl;
+
 	dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
 	spin_lock(&svc_xprt_class_lock);
 	list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
@@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
 			goto err;
 
 		spin_unlock(&svc_xprt_class_lock);
-		newxprt = xcl->xcl_ops->
-			xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
-				   flags);
+		newxprt = __svc_xpo_create(xcl, serv, port, flags);
 		if (IS_ERR(newxprt)) {
 			module_put(xcl->xcl_owner);
 			return PTR_ERR(newxprt);
-- 
cgit v1.2.3


From b6632339e3afbcbb438a3c8935190ea22464fc99 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Aug 2008 19:33:44 -0400
Subject: SUNRPC: Set V6ONLY socket option for RPC listener sockets

My plan is to use an AF_INET listener on systems that support only IPv4,
and an AF_INET6 listener on systems that can support IPv6. Incoming
IPv4 packets will be posted to an AF_INET6 listener with a mapped IPv4
address.

Max Matveev <makc@sgi.com> says:
  Creating a single listener can be dangerous - if net.ipv6.bindv6only
  is enabled then it's possible to create another listener in v4
  namespace on the same port and steal the traffic from the "unifed"
  listener. You need to disable V6ONLY explicitly via a sockopt to stop
  that.

Set appropriate socket option on RPC server listener sockets to prevent
this.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svcsock.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 3e65719f1ef..f91377c1495 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	struct svc_sock	*svsk;
 	struct sock	*inet;
 	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+	int		val;
 
 	dprintk("svc: svc_setup_socket %p\n", sock);
 	if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	else
 		svc_tcp_init(svsk, serv);
 
+	/*
+	 * We start one listener per sv_serv.  We want AF_INET
+	 * requests to be automatically shunted to our AF_INET6
+	 * listener using a mapped IPv4 address.  Make sure
+	 * no-one starts an equivalent IPv4 listener, which
+	 * would steal our incoming connections.
+	 */
+	val = 0;
+	if (serv->sv_family == AF_INET6)
+		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+					(char *)&val, sizeof(val));
+
 	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
 				svsk, svsk->sk_sk);
 
-- 
cgit v1.2.3


From 14aeb2118d6e9fd9ee988324c740a00c80979093 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Aug 2008 19:34:00 -0400
Subject: SUNRPC: Simplify rpcb_register() API

Bruce suggested there's no need to expose the difference between an error
sending the PMAP_SET request and an error reply from the portmapper to
rpcb_register's callers.  The user space equivalent of rpcb_register() is
pmap_set(3), which returns a bool_t : either the PMAP set worked, or it
didn't.  Simple.

So let's remove the "*okay" argument from rpcb_register() and
rpcb_v4_register(), and simply return an error if any part of the call
didn't work.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/rpcb_clnt.c | 65 ++++++++++++++++++++++----------------------------
 net/sunrpc/svc.c       |  8 ++-----
 2 files changed, 30 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 24db2b4d12d..cc7250d4659 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -176,13 +176,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 }
 
 static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
-			      u32 version, struct rpc_message *msg,
-			      int *result)
+			      u32 version, struct rpc_message *msg)
 {
 	struct rpc_clnt *rpcb_clnt;
-	int error = 0;
+	int result, error = 0;
 
-	*result = 0;
+	msg->rpc_resp = &result;
 
 	rpcb_clnt = rpcb_create_local(addr, addrlen, version);
 	if (!IS_ERR(rpcb_clnt)) {
@@ -191,12 +190,19 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
 	} else
 		error = PTR_ERR(rpcb_clnt);
 
-	if (error < 0)
+	if (error < 0) {
 		printk(KERN_WARNING "RPC: failed to contact local rpcbind "
 				"server (errno %d).\n", -error);
-	dprintk("RPC:       registration status %d/%d\n", error, *result);
+		return error;
+	}
+
+	if (!result) {
+		dprintk("RPC:       registration failed\n");
+		return -EACCES;
+	}
 
-	return error;
+	dprintk("RPC:       registration succeeded\n");
+	return 0;
 }
 
 /**
@@ -205,7 +211,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
  * @vers: RPC version number to bind
  * @prot: transport protocol to register
  * @port: port value to register
- * @okay: OUT: result code
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success.  Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
  *
  * RPC services invoke this function to advertise their contact
  * information via the system's rpcbind daemon.  RPC services
@@ -217,15 +227,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
  * all registered transports for [program, version] from the local
  * rpcbind database.
  *
- * Returns zero if the registration request was dispatched
- * successfully and a reply was received.  The rpcbind daemon's
- * boolean result code is stored in *okay.
- *
- * Returns an errno value and sets *result to zero if there was
- * some problem that prevented the rpcbind request from being
- * dispatched, or if the rpcbind daemon did not respond within
- * the timeout.
- *
  * This function uses rpcbind protocol version 2 to contact the
  * local rpcbind daemon.
  *
@@ -236,7 +237,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
  * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
  * addresses).
  */
-int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
+int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
 {
 	struct rpcbind_args map = {
 		.r_prog		= prog,
@@ -246,7 +247,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 	};
 	struct rpc_message msg = {
 		.rpc_argp	= &map,
-		.rpc_resp	= okay,
 	};
 
 	dprintk("RPC:       %sregistering (%u, %u, %d, %u) with local "
@@ -259,7 +259,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 
 	return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
 					sizeof(rpcb_inaddr_loopback),
-					RPCBVERS_2, &msg, okay);
+					RPCBVERS_2, &msg);
 }
 
 /*
@@ -290,7 +290,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
 
 	return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
 					sizeof(rpcb_inaddr_loopback),
-					RPCBVERS_4, msg, msg->rpc_resp);
+					RPCBVERS_4, msg);
 }
 
 /*
@@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
 
 	return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
 					sizeof(rpcb_in6addr_loopback),
-					RPCBVERS_4, msg, msg->rpc_resp);
+					RPCBVERS_4, msg);
 }
 
 /**
@@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
  * @version: RPC version number of service to (un)register
  * @address: address family, IP address, and port to (un)register
  * @netid: netid of transport protocol to (un)register
- * @result: result code from rpcbind RPC call
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success.  Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
  *
  * RPC services invoke this function to advertise their contact
  * information via the system's rpcbind daemon.  RPC services
@@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
  * to zero.  Callers pass a netid of "" to unregister all
  * transport netids associated with [program, version, address].
  *
- * Returns zero if the registration request was dispatched
- * successfully and a reply was received.  The rpcbind daemon's
- * result code is stored in *result.
- *
- * Returns an errno value and sets *result to zero if there was
- * some problem that prevented the rpcbind request from being
- * dispatched, or if the rpcbind daemon did not respond within
- * the timeout.
- *
  * This function uses rpcbind protocol version 4 to contact the
  * local rpcbind daemon.  The local rpcbind daemon must support
  * version 4 of the rpcbind protocol in order for these functions
@@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
  * advertises the service on all IPv4 and IPv6 addresses.
  */
 int rpcb_v4_register(const u32 program, const u32 version,
-		     const struct sockaddr *address, const char *netid,
-		     int *result)
+		     const struct sockaddr *address, const char *netid)
 {
 	struct rpcbind_args map = {
 		.r_prog		= program,
@@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version,
 	};
 	struct rpc_message msg = {
 		.rpc_argp	= &map,
-		.rpc_resp	= result,
 	};
 
-	*result = 0;
-
 	switch (address->sa_family) {
 	case AF_INET:
 		return rpcb_register_netid4((struct sockaddr_in *)address,
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9ba17044109..9805143d066 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -730,7 +730,7 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 	struct svc_program	*progp;
 	unsigned long		flags;
 	unsigned int		i;
-	int			error = 0, dummy;
+	int			error = 0;
 
 	if (!port)
 		clear_thread_flag(TIF_SIGPENDING);
@@ -751,13 +751,9 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 			if (progp->pg_vers[i]->vs_hidden)
 				continue;
 
-			error = rpcb_register(progp->pg_prog, i, proto, port, &dummy);
+			error = rpcb_register(progp->pg_prog, i, proto, port);
 			if (error < 0)
 				break;
-			if (port && !dummy) {
-				error = -EACCES;
-				break;
-			}
 		}
 	}
 
-- 
cgit v1.2.3


From 7252d575ab0e8771269a3d245c36a05ace5152bd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Aug 2008 19:34:08 -0400
Subject: SUNRPC: Split portmap unregister API into separate function

Create a separate server-level interface for unregistering RPC services.

The mechanics of, and the API for, registering and unregistering RPC
services will diverge further as support for IPv6 is added.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9805143d066..9eb78a771da 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -28,6 +28,8 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCDSP
 
+static void svc_unregister(const struct svc_serv *serv);
+
 #define svc_serv_is_pooled(serv)    ((serv)->sv_function)
 
 /*
@@ -417,9 +419,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		spin_lock_init(&pool->sp_lock);
 	}
 
-
 	/* Remove any stale portmap registrations */
-	svc_register(serv, 0, 0);
+	svc_unregister(serv);
 
 	return serv;
 }
@@ -487,8 +488,7 @@ svc_destroy(struct svc_serv *serv)
 	if (svc_serv_is_pooled(serv))
 		svc_pool_map_put();
 
-	/* Unregister service with the portmapper */
-	svc_register(serv, 0, 0);
+	svc_unregister(serv);
 	kfree(serv->sv_pools);
 	kfree(serv);
 }
@@ -728,12 +728,10 @@ int
 svc_register(struct svc_serv *serv, int proto, unsigned short port)
 {
 	struct svc_program	*progp;
-	unsigned long		flags;
 	unsigned int		i;
 	int			error = 0;
 
-	if (!port)
-		clear_thread_flag(TIF_SIGPENDING);
+	BUG_ON(proto == 0 && port == 0);
 
 	for (progp = serv->sv_program; progp; progp = progp->pg_next) {
 		for (i = 0; i < progp->pg_nvers; i++) {
@@ -757,13 +755,53 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 		}
 	}
 
-	if (!port) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	return error;
+}
+
+/*
+ * All transport protocols and ports for this service are removed
+ * from the local rpcbind database if the service is not hidden.
+ *
+ * The result of unregistration is reported via dprintk for those
+ * who want verification of the result, but is otherwise not
+ * important.
+ *
+ * The local rpcbind daemon listens on either only IPv6 or only
+ * IPv4.  The kernel can't tell how it's configured.  However,
+ * AF_INET addresses are mapped to AF_INET6 in IPv6-only config-
+ * urations, so even an unregistration request on AF_INET will
+ * get to a local rpcbind daemon listening only on AF_INET6.  So
+ * we always unregister via AF_INET.
+ *
+ * At this point we don't need rpcbind version 4 for unregis-
+ * tration:  A v2 UNSET request will clear all transports (netids),
+ * addresses, and address families for [program, version].
+ */
+static void svc_unregister(const struct svc_serv *serv)
+{
+	struct svc_program *progp;
+	unsigned long flags;
+	unsigned int i;
+	int error;
+
+	clear_thread_flag(TIF_SIGPENDING);
+
+	for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+		for (i = 0; i < progp->pg_nvers; i++) {
+			if (progp->pg_vers[i] == NULL)
+				continue;
+			if (progp->pg_vers[i]->vs_hidden)
+				continue;
+
+			error = rpcb_register(progp->pg_prog, i, 0, 0);
+			dprintk("svc: svc_unregister(%sv%u), error %d\n",
+					progp->pg_name, i, error);
+		}
 	}
 
-	return error;
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 }
 
 /*
-- 
cgit v1.2.3


From a26cfad6e0a308a2c68df1f1ef50aabd48b17e6d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Aug 2008 19:34:16 -0400
Subject: SUNRPC: Support IPv6 when registering kernel RPC services

In order to advertise NFS-related services on IPv6 interfaces via
rpcbind, the kernel RPC server implementation must use
rpcb_v4_register() instead of rpcb_register().

A new kernel build option allows distributions to use the legacy
v2 call until they integrate an appropriate user-space rpcbind
daemon that can support IPv6 RPC services.

I tried adding some automatic logic to fall back if registering
with a v4 protocol request failed, but there are too many corner
cases.  So I just made it a compile-time switch that distributions
can throw when they've replaced portmapper with rpcbind.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9eb78a771da..c43ccb62805 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -719,13 +719,92 @@ svc_exit_thread(struct svc_rqst *rqstp)
 }
 EXPORT_SYMBOL(svc_exit_thread);
 
+#ifdef CONFIG_SUNRPC_REGISTER_V4
 /*
- * Register an RPC service with the local portmapper.
- * To unregister a service, call this routine with
- * proto and port == 0.
+ * Registering kernel RPC services with rpcbind version 2 will work
+ * over either IPv4 or IPv6, since the Linux kernel always registers
+ * services for the "any" address.
+ *
+ * However, the local rpcbind daemon listens on either only AF_INET
+ * or AF_INET6 (never both).  When it listens on AF_INET6, an rpcbind
+ * version 2 registration will result in registering the service at
+ * IN6ADDR_ANY, even if the RPC service being registered is not
+ * IPv6-enabled.
+ *
+ * Rpcbind version 4 allows us to be a little more specific.  Kernel
+ * RPC services that don't yet support AF_INET6 can register
+ * themselves as IPv4-only with the local rpcbind daemon, even if the
+ * daemon is listening only on AF_INET6.
+ *
+ * And, registering IPv6-enabled kernel RPC services via AF_INET6
+ * verifies that the local user space rpcbind daemon is properly
+ * configured to support remote AF_INET6 rpcbind requests.
+ *
+ * An AF_INET6 registration request will fail if the local rpcbind
+ * daemon is not set up to listen on AF_INET6.  Likewise, we fail
+ * AF_INET6 registration requests if svc_register() is configured to
+ * support only rpcbind version 2.
  */
-int
-svc_register(struct svc_serv *serv, int proto, unsigned short port)
+static int __svc_register(const u32 program, const u32 version,
+			  const sa_family_t family,
+			  const unsigned short protocol,
+			  const unsigned short port)
+{
+	struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_ANY),
+		.sin_port		= htons(port),
+	};
+	struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+		.sin6_port		= htons(port),
+	};
+	struct sockaddr *sap;
+	char *netid;
+
+	switch (family) {
+	case AF_INET:
+		sap = (struct sockaddr *)&sin;
+		netid = RPCBIND_NETID_TCP;
+		if (protocol == IPPROTO_UDP)
+			netid = RPCBIND_NETID_UDP;
+		break;
+	case AF_INET6:
+		sap = (struct sockaddr *)&sin6;
+		netid = RPCBIND_NETID_TCP6;
+		if (protocol == IPPROTO_UDP)
+			netid = RPCBIND_NETID_UDP6;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	return rpcb_v4_register(program, version, sap, netid);
+}
+#else
+static int __svc_register(const u32 program, const u32 version,
+			  sa_family_t family,
+			  const unsigned short protocol,
+			  const unsigned short port)
+{
+	if (family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	return rpcb_register(program, version, protocol, port);
+}
+#endif
+
+/**
+ * svc_register - register an RPC service with the local portmapper
+ * @serv: svc_serv struct for the service to register
+ * @proto: transport protocol number to advertise
+ * @port: port to advertise
+ *
+ * Service is registered for any address in serv's address family
+ */
+int svc_register(const struct svc_serv *serv, const unsigned short proto,
+		 const unsigned short port)
 {
 	struct svc_program	*progp;
 	unsigned int		i;
@@ -738,8 +817,9 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 			if (progp->pg_vers[i] == NULL)
 				continue;
 
-			dprintk("svc: svc_register(%s, %s, %d, %d)%s\n",
+			dprintk("svc: svc_register(%s, %u, %s, %u, %d)%s\n",
 					progp->pg_name,
+					serv->sv_family,
 					proto == IPPROTO_UDP?  "udp" : "tcp",
 					port,
 					i,
@@ -749,7 +829,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 			if (progp->pg_vers[i]->vs_hidden)
 				continue;
 
-			error = rpcb_register(progp->pg_prog, i, proto, port);
+			error = __svc_register(progp->pg_prog, i,
+						serv->sv_family, proto, port);
 			if (error < 0)
 				break;
 		}
-- 
cgit v1.2.3


From 2c7eb0b206b8408d92c518033a359f4374c75314 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 15 Sep 2008 16:27:23 -0500
Subject: SUNRPC: Register both netids for AF_INET6 servers

TI-RPC is a user-space library of RPC functions that replaces ONC RPC
and allows RPC to operate in the new world of IPv6.

TI-RPC combines the concept of a transport protocol (UDP and TCP)
and a protocol family (PF_INET and PF_INET6) into a single identifier
called a "netid."  For example, "udp" means UDP over IPv4, and "udp6"
means UDP over IPv6.

For rpcbind, then, the RPC service tuple that is registered and
advertised is:

  [RPC program, RPC version, service address and port, netid]

instead of

  [RPC program, RPC version, port, protocol]

Service address is typically ANYADDR, but can be a specific address
of one of the interfaces on a multi-homed host.  The third item in
the new tuple is expressed as a universal address.

The current Linux rpcbind implementation registers a netid for both
protocol families when RPCB_SET is done for just the PF_INET6 version
of the netid (ie udp6 or tcp6).  So registering "udp6" causes a
registration for "udp" to appear automatically as well.

We've recently determined that this is incorrect behavior.  In the
TI-RPC world, "udp6" is not meant to imply that the registered RPC
service handles requests from AF_INET as well, even if the listener
socket does address mapping.  "udp" and "udp6" are entirely separate
capabilities, and must be registered separately.

The Linux kernel, unlike TI-RPC, leverages address mapping to allow a
single listener socket to handle requests for both AF_INET and AF_INET6.
This is still OK, but the kernel currently assumes registering "udp6"
will cover "udp" as well.  It registers only "udp6" for it's AF_INET6
services, even though they handle both AF_INET and AF_INET6 on the same
port.

So svc_register() actually needs to register both "udp" and "udp6"
explicitly (and likewise for TCP).  Until rpcbind is fixed, the
kernel can ignore the return code for the second RPCB_SET call.

Please merge this with commit 15231312:

    SUNRPC: Support IPv6 when registering kernel RPC services

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Olaf Kirch <okir@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c | 143 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c43ccb62805..b8d2fcd0f71 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -720,69 +720,125 @@ svc_exit_thread(struct svc_rqst *rqstp)
 EXPORT_SYMBOL(svc_exit_thread);
 
 #ifdef CONFIG_SUNRPC_REGISTER_V4
+
 /*
- * Registering kernel RPC services with rpcbind version 2 will work
- * over either IPv4 or IPv6, since the Linux kernel always registers
- * services for the "any" address.
- *
- * However, the local rpcbind daemon listens on either only AF_INET
- * or AF_INET6 (never both).  When it listens on AF_INET6, an rpcbind
- * version 2 registration will result in registering the service at
- * IN6ADDR_ANY, even if the RPC service being registered is not
- * IPv6-enabled.
+ * Register an "inet" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
  *
- * Rpcbind version 4 allows us to be a little more specific.  Kernel
- * RPC services that don't yet support AF_INET6 can register
- * themselves as IPv4-only with the local rpcbind daemon, even if the
- * daemon is listening only on AF_INET6.
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
  *
- * And, registering IPv6-enabled kernel RPC services via AF_INET6
- * verifies that the local user space rpcbind daemon is properly
- * configured to support remote AF_INET6 rpcbind requests.
- *
- * An AF_INET6 registration request will fail if the local rpcbind
- * daemon is not set up to listen on AF_INET6.  Likewise, we fail
- * AF_INET6 registration requests if svc_register() is configured to
- * support only rpcbind version 2.
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
  */
-static int __svc_register(const u32 program, const u32 version,
-			  const sa_family_t family,
-			  const unsigned short protocol,
-			  const unsigned short port)
+static int __svc_rpcb_register4(const u32 program, const u32 version,
+				const unsigned short protocol,
+				const unsigned short port)
 {
 	struct sockaddr_in sin = {
 		.sin_family		= AF_INET,
 		.sin_addr.s_addr	= htonl(INADDR_ANY),
 		.sin_port		= htons(port),
 	};
+	char *netid;
+
+	switch (protocol) {
+	case IPPROTO_UDP:
+		netid = RPCBIND_NETID_UDP;
+		break;
+	case IPPROTO_TCP:
+		netid = RPCBIND_NETID_TCP;
+		break;
+	default:
+		return -EPROTONOSUPPORT;
+	}
+
+	return rpcb_v4_register(program, version,
+				(struct sockaddr *)&sin, netid);
+}
+
+/*
+ * Register an "inet6" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register6(const u32 program, const u32 version,
+				const unsigned short protocol,
+				const unsigned short port)
+{
 	struct sockaddr_in6 sin6 = {
 		.sin6_family		= AF_INET6,
 		.sin6_addr		= IN6ADDR_ANY_INIT,
 		.sin6_port		= htons(port),
 	};
-	struct sockaddr *sap;
 	char *netid;
 
-	switch (family) {
-	case AF_INET:
-		sap = (struct sockaddr *)&sin;
-		netid = RPCBIND_NETID_TCP;
-		if (protocol == IPPROTO_UDP)
-			netid = RPCBIND_NETID_UDP;
+	switch (protocol) {
+	case IPPROTO_UDP:
+		netid = RPCBIND_NETID_UDP6;
 		break;
-	case AF_INET6:
-		sap = (struct sockaddr *)&sin6;
+	case IPPROTO_TCP:
 		netid = RPCBIND_NETID_TCP6;
-		if (protocol == IPPROTO_UDP)
-			netid = RPCBIND_NETID_UDP6;
 		break;
 	default:
-		return -EAFNOSUPPORT;
+		return -EPROTONOSUPPORT;
+	}
+
+	return rpcb_v4_register(program, version,
+				(struct sockaddr *)&sin6, netid);
+}
+
+/*
+ * Register a kernel RPC service via rpcbind version 4.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(const u32 program, const u32 version,
+			  const sa_family_t family,
+			  const unsigned short protocol,
+			  const unsigned short port)
+{
+	int error;
+
+	switch (family) {
+	case AF_INET:
+		return __svc_rpcb_register4(program, version,
+						protocol, port);
+	case AF_INET6:
+		error = __svc_rpcb_register6(program, version,
+						protocol, port);
+		if (error < 0)
+			return error;
+
+		/*
+		 * Work around bug in some versions of Linux rpcbind
+		 * which don't allow registration of both inet and
+		 * inet6 netids.
+		 *
+		 * Error return ignored for now.
+		 */
+		__svc_rpcb_register4(program, version,
+						protocol, port);
+		return 0;
 	}
 
-	return rpcb_v4_register(program, version, sap, netid);
+	return -EAFNOSUPPORT;
 }
-#else
+
+#else	/* CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
+ * Register a kernel RPC service via rpcbind version 2.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
 static int __svc_register(const u32 program, const u32 version,
 			  sa_family_t family,
 			  const unsigned short protocol,
@@ -793,7 +849,8 @@ static int __svc_register(const u32 program, const u32 version,
 
 	return rpcb_register(program, version, protocol, port);
 }
-#endif
+
+#endif /* CONFIG_SUNRPC_REGISTER_V4 */
 
 /**
  * svc_register - register an RPC service with the local portmapper
@@ -817,12 +874,12 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
 			if (progp->pg_vers[i] == NULL)
 				continue;
 
-			dprintk("svc: svc_register(%s, %u, %s, %u, %d)%s\n",
+			dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
 					progp->pg_name,
-					serv->sv_family,
+					i,
 					proto == IPPROTO_UDP?  "udp" : "tcp",
 					port,
-					i,
+					serv->sv_family,
 					progp->pg_vers[i]->vs_hidden?
 						" (but not telling portmap)" : "");
 
-- 
cgit v1.2.3


From 9d548b9c955c0709d1229d21d0bc14afa6b356de Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 15 Sep 2008 16:27:30 -0500
Subject: SUNRPC: Use short-hand IPv6 ANYADDR for RPCB_SET

Clean up: When doing an RPCB_SET, make the kernel's rpcb client use the
shorthand "::" for the universal form of the IPv6 ANY address.

Without this patch, rpcbind will advertise:

  0000:0000:0000:0000:0000:0000:0000:0000.x.y

This is cosmetic only.  It cleans up the display of information from
/sbin/rpcinfo.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/rpcb_clnt.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index cc7250d4659..0fa1086cf99 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -20,6 +20,7 @@
 #include <linux/in6.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <net/ipv6.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
@@ -304,10 +305,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
 	char buf[64];
 
 	/* Construct AF_INET6 universal address */
-	snprintf(buf, sizeof(buf),
-			NIP6_FMT".%u.%u",
-			NIP6(address_to_register->sin6_addr),
-			port >> 8, port & 0xff);
+	if (ipv6_addr_any(&address_to_register->sin6_addr))
+		snprintf(buf, sizeof(buf), "::.%u.%u",
+				port >> 8, port & 0xff);
+	else
+		snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u",
+				NIP6(address_to_register->sin6_addr),
+				port >> 8, port & 0xff);
 	map->r_addr = buf;
 
 	dprintk("RPC:       %sregistering [%u, %u, %s, '%s'] with "
-- 
cgit v1.2.3


From f6fb3f6f591b50fa4f51962ad06ee0d8782e1bc8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 25 Sep 2008 11:56:57 -0400
Subject: SUNRPC: Fix up svc_unregister()

With the new rpcbind code, a PMAP_UNSET will not have any effect on
services registered via rpcbind v3 or v4.

Implement a version of svc_unregister() that uses an RPCB_UNSET with
an empty netid string to make sure we have cleared *all* entries for
a kernel RPC service when shutting down, or before starting a fresh
instance of the service.

Use the new version only when CONFIG_SUNRPC_REGISTER_V4 is enabled;
otherwise, the legacy PMAP version is used to ensure complete
backwards-compatibility with the Linux portmapper daemon.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c | 58 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b8d2fcd0f71..54c98d87684 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -896,31 +896,51 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
 	return error;
 }
 
+#ifdef CONFIG_SUNRPC_REGISTER_V4
+
+static void __svc_unregister(const u32 program, const u32 version,
+			     const char *progname)
+{
+	struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+		.sin6_port		= 0,
+	};
+	int error;
+
+	error = rpcb_v4_register(program, version,
+				(struct sockaddr *)&sin6, "");
+	dprintk("svc: %s(%sv%u), error %d\n",
+			__func__, progname, version, error);
+}
+
+#else	/* CONFIG_SUNRPC_REGISTER_V4 */
+
+static void __svc_unregister(const u32 program, const u32 version,
+			     const char *progname)
+{
+	int error;
+
+	error = rpcb_register(program, version, 0, 0);
+	dprintk("svc: %s(%sv%u), error %d\n",
+			__func__, progname, version, error);
+}
+
+#endif	/* CONFIG_SUNRPC_REGISTER_V4 */
+
 /*
- * All transport protocols and ports for this service are removed
- * from the local rpcbind database if the service is not hidden.
- *
- * The result of unregistration is reported via dprintk for those
- * who want verification of the result, but is otherwise not
- * important.
+ * All netids, bind addresses and ports registered for [program, version]
+ * are removed from the local rpcbind database (if the service is not
+ * hidden) to make way for a new instance of the service.
  *
- * The local rpcbind daemon listens on either only IPv6 or only
- * IPv4.  The kernel can't tell how it's configured.  However,
- * AF_INET addresses are mapped to AF_INET6 in IPv6-only config-
- * urations, so even an unregistration request on AF_INET will
- * get to a local rpcbind daemon listening only on AF_INET6.  So
- * we always unregister via AF_INET.
- *
- * At this point we don't need rpcbind version 4 for unregis-
- * tration:  A v2 UNSET request will clear all transports (netids),
- * addresses, and address families for [program, version].
+ * The result of unregistration is reported via dprintk for those who want
+ * verification of the result, but is otherwise not important.
  */
 static void svc_unregister(const struct svc_serv *serv)
 {
 	struct svc_program *progp;
 	unsigned long flags;
 	unsigned int i;
-	int error;
 
 	clear_thread_flag(TIF_SIGPENDING);
 
@@ -931,9 +951,7 @@ static void svc_unregister(const struct svc_serv *serv)
 			if (progp->pg_vers[i]->vs_hidden)
 				continue;
 
-			error = rpcb_register(progp->pg_prog, i, 0, 0);
-			dprintk("svc: svc_unregister(%sv%u), error %d\n",
-					progp->pg_name, i, error);
+			__svc_unregister(progp->pg_prog, i, progp->pg_name);
 		}
 	}
 
-- 
cgit v1.2.3


From db820d6376aa81accf5b648651e160fd76e363e2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 25 Sep 2008 11:57:05 -0400
Subject: SUNRPC: Clean up debug messages in rpcb_clnt.c

The RPCB XDR functions are used for multiple procedures.  For instance,
rpcb_encode_getaddr() is used for RPCB_GETADDR, RPCB_SET, and
RPCB_UNSET.  Make the XDR debug messages more generic so they are less
confusing.

And, unlike in other RPC consumers in the kernel, a single debug flag
enables all levels of debug messages in the RPC bind client, including
XDR debug messages.  Since the XDR decoders already report success or
failure in this case, remove redundant debug messages in the mid-level
rpcb_register_call() function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/rpcb_clnt.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 0fa1086cf99..34abc91058d 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -197,12 +197,8 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
 		return error;
 	}
 
-	if (!result) {
-		dprintk("RPC:       registration failed\n");
+	if (!result)
 		return -EACCES;
-	}
-
-	dprintk("RPC:       registration succeeded\n");
 	return 0;
 }
 
@@ -628,7 +624,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
 static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p,
 			       struct rpcbind_args *rpcb)
 {
-	dprintk("RPC:       rpcb_encode_mapping(%u, %u, %d, %u)\n",
+	dprintk("RPC:       encoding rpcb request (%u, %u, %d, %u)\n",
 			rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
 	*p++ = htonl(rpcb->r_prog);
 	*p++ = htonl(rpcb->r_vers);
@@ -643,7 +639,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p,
 			       unsigned short *portp)
 {
 	*portp = (unsigned short) ntohl(*p++);
-	dprintk("RPC:       rpcb_decode_getport result %u\n",
+	dprintk("RPC:       rpcb getport result: %u\n",
 			*portp);
 	return 0;
 }
@@ -652,7 +648,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
 			   unsigned int *boolp)
 {
 	*boolp = (unsigned int) ntohl(*p++);
-	dprintk("RPC:       rpcb_decode_set: call %s\n",
+	dprintk("RPC:       rpcb set/unset call %s\n",
 			(*boolp ? "succeeded" : "failed"));
 	return 0;
 }
@@ -660,7 +656,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
 static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p,
 			       struct rpcbind_args *rpcb)
 {
-	dprintk("RPC:       rpcb_encode_getaddr(%u, %u, %s)\n",
+	dprintk("RPC:       encoding rpcb request (%u, %u, %s)\n",
 			rpcb->r_prog, rpcb->r_vers, rpcb->r_addr);
 	*p++ = htonl(rpcb->r_prog);
 	*p++ = htonl(rpcb->r_vers);
-- 
cgit v1.2.3


From d5b337b4877f7c4e1d761434ee04d045b0201e03 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 28 Sep 2008 09:21:26 +0300
Subject: nfsd: use nfs client rpc callback program

since commit ff7d9756b501744540be65e172d27ee321d86103
"nfsd: use static memory for callback program and stats"
do_probe_callback uses a static callback program
(NFS4_CALLBACK) rather than the one set in clp->cl_callback.cb_prog
as passed in by the client in setclientid (4.0)
or create_session (4.1).

This patches introduces rpc_create_args.prognumber that allows
overriding program->number when creating rpc_clnt.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/clnt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 76739e928d0..da0789fa1b8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 	clnt->cl_procinfo = version->procs;
 	clnt->cl_maxproc  = version->nrprocs;
 	clnt->cl_protname = program->name;
-	clnt->cl_prog     = program->number;
+	clnt->cl_prog     = args->prognumber ? : program->number;
 	clnt->cl_vers     = version->number;
 	clnt->cl_stats    = program->stats;
 	clnt->cl_metrics  = rpc_alloc_iostats(clnt);
-- 
cgit v1.2.3


From cf04a4c764cd3e651a64b3e667bb6a673ead99e1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 30 Sep 2008 02:22:14 -0700
Subject: netdev: use const for some name functions

dev_change_name and netdev_drivername should use const char on
parameters that are read-only input values. The strcpy to newname is
not needed since newname is not used later in function.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index a90737fe247..64f0d5b7cdf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -890,7 +890,7 @@ int dev_alloc_name(struct net_device *dev, const char *name)
  *	Change name of a device, can pass format strings "eth%d".
  *	for wildcarding.
  */
-int dev_change_name(struct net_device *dev, char *newname)
+int dev_change_name(struct net_device *dev, const char *newname)
 {
 	char oldname[IFNAMSIZ];
 	int err = 0;
@@ -916,7 +916,6 @@ int dev_change_name(struct net_device *dev, char *newname)
 		err = dev_alloc_name(dev, newname);
 		if (err < 0)
 			return err;
-		strcpy(newname, dev->name);
 	}
 	else if (__dev_get_by_name(net, newname))
 		return -EEXIST;
@@ -4754,10 +4753,10 @@ err_name:
 	return -ENOMEM;
 }
 
-char *netdev_drivername(struct net_device *dev, char *buffer, int len)
+char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
 {
-	struct device_driver *driver;
-	struct device *parent;
+	const struct device_driver *driver;
+	const struct device *parent;
 
 	if (len <= 0 || !buffer)
 		return buffer;
-- 
cgit v1.2.3


From f0db275a81ef184293ca7ef3646fe065b336efb7 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 30 Sep 2008 02:23:58 -0700
Subject: netdev: docbook comment update (revised)

Add more docbook comments to network device functions and cleanup
the comments.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 64f0d5b7cdf..2cc258b7acc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -956,6 +956,7 @@ rollback:
  *	dev_set_alias - change ifalias of a device
  *	@dev: device
  *	@alias: name up to IFALIASZ
+ *	@len: limit of bytes to copy from info
  *
  *	Set ifalias for a device,
  */
@@ -3330,6 +3331,12 @@ static void dev_addr_discard(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/**
+ *	dev_get_flags - get flags reported to userspace
+ *	@dev: device
+ *
+ *	Get the combination of flag bits exported through APIs to userspace.
+ */
 unsigned dev_get_flags(const struct net_device *dev)
 {
 	unsigned flags;
@@ -3354,6 +3361,14 @@ unsigned dev_get_flags(const struct net_device *dev)
 	return flags;
 }
 
+/**
+ *	dev_change_flags - change device settings
+ *	@dev: device
+ *	@flags: device state flags
+ *
+ *	Change settings on device based state flags. The flags are
+ *	in the userspace exported format.
+ */
 int dev_change_flags(struct net_device *dev, unsigned flags)
 {
 	int ret, changes;
@@ -3423,6 +3438,13 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	return ret;
 }
 
+/**
+ *	dev_set_mtu - Change maximum transfer unit
+ *	@dev: device
+ *	@new_mtu: new transfer unit
+ *
+ *	Change the maximum transfer size of the network device.
+ */
 int dev_set_mtu(struct net_device *dev, int new_mtu)
 {
 	int err;
@@ -3447,6 +3469,13 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	return err;
 }
 
+/**
+ *	dev_set_mac_address - Change Media Access Control Address
+ *	@dev: device
+ *	@sa: new address
+ *
+ *	Change the hardware (MAC) address of the device
+ */
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 {
 	int err;
@@ -4350,7 +4379,12 @@ void free_netdev(struct net_device *dev)
 	put_device(&dev->dev);
 }
 
-/* Synchronize with packet receive processing. */
+/**
+ *	synchronize_net -  Synchronize with packet receive processing
+ *
+ *	Wait for packets currently being received to be done.
+ *	Does not block later packets from starting.
+ */
 void synchronize_net(void)
 {
 	might_sleep();
@@ -4652,7 +4686,7 @@ netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
 }
 
 /**
- * netdev_dma_regiser - register the networking subsystem as a DMA client
+ * netdev_dma_register - register the networking subsystem as a DMA client
  */
 static int __init netdev_dma_register(void)
 {
@@ -4753,6 +4787,14 @@ err_name:
 	return -ENOMEM;
 }
 
+/**
+ *	netdev_drivername - network driver for the device
+ *	@dev: network device
+ *	@buffer: buffer for resulting name
+ *	@len: size of buffer
+ *
+ *	Determine network driver for device.
+ */
 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
 {
 	const struct device_driver *driver;
-- 
cgit v1.2.3


From 8980713b97393b21a50d11408a22d2caa87d016a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Tue, 30 Sep 2008 02:51:18 -0700
Subject: Phonet: Netlink factorization and cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/pn_netlink.c | 91 +++++++++++++++++++------------------------------
 1 file changed, 35 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index b1ea19a230d..b1770d66bc8 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -54,11 +54,16 @@ errout:
 		rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
 }
 
-static int newaddr_doit(struct sk_buff *skb, struct nlmsghdr *nlm, void *attr)
+static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = {
+	[IFA_LOCAL] = { .type = NLA_U8 },
+};
+
+static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr)
 {
-	struct rtattr **rta = attr;
-	struct ifaddrmsg *ifm = NLMSG_DATA(nlm);
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
 	struct net_device *dev;
+	struct ifaddrmsg *ifm;
 	int err;
 	u8 pnaddr;
 
@@ -67,52 +72,28 @@ static int newaddr_doit(struct sk_buff *skb, struct nlmsghdr *nlm, void *attr)
 
 	ASSERT_RTNL();
 
-	if (rta[IFA_LOCAL - 1] == NULL)
-		return -EINVAL;
-
-	dev = __dev_get_by_index(&init_net, ifm->ifa_index);
-	if (dev == NULL)
-		return -ENODEV;
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy);
+	if (err < 0)
+		return err;
 
-	if (ifm->ifa_prefixlen > 0)
+	ifm = nlmsg_data(nlh);
+	if (tb[IFA_LOCAL] == NULL)
 		return -EINVAL;
-
-	memcpy(&pnaddr, RTA_DATA(rta[IFA_LOCAL - 1]), 1);
-
-	err = phonet_address_add(dev, pnaddr);
-	if (!err)
-		rtmsg_notify(RTM_NEWADDR, dev, pnaddr);
-	return err;
-}
-
-static int deladdr_doit(struct sk_buff *skb, struct nlmsghdr *nlm, void *attr)
-{
-	struct rtattr **rta = attr;
-	struct ifaddrmsg *ifm = NLMSG_DATA(nlm);
-	struct net_device *dev;
-	int err;
-	u8 pnaddr;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	ASSERT_RTNL();
-
-	if (rta[IFA_LOCAL - 1] == NULL)
+	pnaddr = nla_get_u8(tb[IFA_LOCAL]);
+	if (pnaddr & 3)
+		/* Phonet addresses only have 6 high-order bits */
 		return -EINVAL;
 
-	dev = __dev_get_by_index(&init_net, ifm->ifa_index);
+	dev = __dev_get_by_index(net, ifm->ifa_index);
 	if (dev == NULL)
 		return -ENODEV;
 
-	if (ifm->ifa_prefixlen > 0)
-		return -EADDRNOTAVAIL;
-
-	memcpy(&pnaddr, RTA_DATA(rta[IFA_LOCAL - 1]), 1);
-
-	err = phonet_address_del(dev, pnaddr);
+	if (nlh->nlmsg_type == RTM_NEWADDR)
+		err = phonet_address_add(dev, pnaddr);
+	else
+		err = phonet_address_del(dev, pnaddr);
 	if (!err)
-		rtmsg_notify(RTM_DELADDR, dev, pnaddr);
+		rtmsg_notify(nlh->nlmsg_type, dev, pnaddr);
 	return err;
 }
 
@@ -121,25 +102,23 @@ static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
-	unsigned int orig_len = skb->len;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct ifaddrmsg));
-	ifm = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), 0);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
 	ifm->ifa_family = AF_PHONET;
 	ifm->ifa_prefixlen = 0;
 	ifm->ifa_flags = IFA_F_PERMANENT;
-	ifm->ifa_scope = RT_SCOPE_HOST;
+	ifm->ifa_scope = RT_SCOPE_LINK;
 	ifm->ifa_index = dev->ifindex;
-	RTA_PUT(skb, IFA_LOCAL, 1, &addr);
-	nlh->nlmsg_len = skb->len - orig_len;
-
-	return 0;
-
-nlmsg_failure:
-rtattr_failure:
-	skb_trim(skb, orig_len);
+	NLA_PUT_U8(skb, IFA_LOCAL, addr);
+	return nlmsg_end(skb, nlh);
 
-	return -1;
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
 }
 
 static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
@@ -180,7 +159,7 @@ out:
 
 void __init phonet_netlink_register(void)
 {
-	rtnl_register(PF_PHONET, RTM_NEWADDR, newaddr_doit, NULL);
-	rtnl_register(PF_PHONET, RTM_DELADDR, deladdr_doit, NULL);
+	rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, NULL);
+	rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL);
 	rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit);
 }
-- 
cgit v1.2.3


From b4a4bf5d77c7d32098a7080f34a8857dd7fa466d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 26 Sep 2008 13:34:54 +0200
Subject: mac80211: fixups for "make master iface not wireless"

In "mac80211: make master iface not wireless" I accidentally
forgot to include these changes ... leading to the expected
BUG_ON errors.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/iface.c | 9 ++++-----
 net/mac80211/wme.c   | 8 ++++----
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b5cd91e8971..8336fee68d3 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -58,8 +58,9 @@ static inline int identical_mac_addr_allowed(int type1, int type2)
 
 static int ieee80211_open(struct net_device *dev)
 {
-	struct ieee80211_sub_if_data *sdata, *nsdata;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_sub_if_data *nsdata;
+	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 	struct ieee80211_if_init_conf conf;
 	u32 changed = 0;
@@ -67,8 +68,6 @@ static int ieee80211_open(struct net_device *dev)
 	bool need_hw_reconfig = 0;
 	u8 null_addr[ETH_ALEN] = {0};
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
 	/* fail early if user set an invalid address */
 	if (compare_ether_addr(dev->dev_addr, null_addr) &&
 	    !is_valid_ether_addr(dev->dev_addr))
@@ -512,8 +511,8 @@ static int ieee80211_stop(struct net_device *dev)
 
 static void ieee80211_set_multicast_list(struct net_device *dev)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
 	int allmulti, promisc, sdata_allmulti, sdata_promisc;
 
 	allmulti = !!(dev->flags & IFF_ALLMULTI);
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index c703f8b44e9..139b5f267b3 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -73,9 +73,8 @@ static int wme_downgrade_ac(struct sk_buff *skb)
 
 
 /* Indicate which queue to use.  */
-static u16 classify80211(struct sk_buff *skb, struct net_device *dev)
+static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 
 	if (!ieee80211_is_data(hdr->frame_control)) {
@@ -113,14 +112,15 @@ static u16 classify80211(struct sk_buff *skb, struct net_device *dev)
 
 u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
+	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
+	struct ieee80211_local *local = mpriv->local;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct sta_info *sta;
 	u16 queue;
 	u8 tid;
 
-	queue = classify80211(skb, dev);
+	queue = classify80211(local, skb);
 	if (unlikely(queue >= local->hw.queues))
 		queue = local->hw.queues - 1;
 
-- 
cgit v1.2.3


From d88410a0b657c5ccebd1c120af1f14c5ca6a3d95 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Thu, 25 Sep 2008 20:45:01 +0300
Subject: mac80211: remove wme_tx_queue and wme_rx_queue from
 net/mac80211/sta_info.h

This patch removes wme_tx_queue and wme_rx_queue from struct sta_info
and from the debugfs sub-structure of struct sta_info
in net/mac80211/sta_info.h, as they are useless and not used.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/sta_info.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index c3f43696462..a6b51862a89 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -189,7 +189,6 @@ struct sta_ampdu_mlme {
  * @last_qual: qual of last received frame from this STA
  * @last_noise: noise of last received frame from this STA
  * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue)
- * @wme_rx_queue: TBD
  * @tx_filtered_count: TBD
  * @tx_retry_failed: TBD
  * @tx_retry_count: TBD
@@ -199,7 +198,6 @@ struct sta_ampdu_mlme {
  * @tx_fragments: number of transmitted MPDUs
  * @last_txrate_idx: Index of the last used transmit rate
  * @tid_seq: TBD
- * @wme_tx_queue: TBD
  * @ampdu_mlme: TBD
  * @timer_to_tid: identity mapping to ID timers
  * @tid_to_tx_q: map tid to tx queue
@@ -258,9 +256,6 @@ struct sta_info {
 	int last_qual;
 	int last_noise;
 	__le16 last_seq_ctrl[NUM_RX_DATA_QUEUES];
-#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
-	unsigned int wme_rx_queue[NUM_RX_DATA_QUEUES];
-#endif
 
 	/* Updated from TX status path only, no locking requirements */
 	unsigned long tx_filtered_count;
@@ -274,9 +269,6 @@ struct sta_info {
 	unsigned long tx_fragments;
 	unsigned int last_txrate_idx;
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
-#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
-	unsigned int wme_tx_queue[NUM_RX_DATA_QUEUES];
-#endif
 
 	/*
 	 * Aggregation information, locked with lock.
@@ -307,10 +299,6 @@ struct sta_info {
 		struct dentry *num_ps_buf_frames;
 		struct dentry *inactive_ms;
 		struct dentry *last_seq_ctrl;
-#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
-		struct dentry *wme_rx_queue;
-		struct dentry *wme_tx_queue;
-#endif
 		struct dentry *agg_status;
 	} debugfs;
 #endif
-- 
cgit v1.2.3


From b0dee5784dff3e2984510a7fe7a7e48109001f94 Mon Sep 17 00:00:00 2001
From: Davide Pesavento <davidepesa@gmail.com>
Date: Sat, 27 Sep 2008 17:29:12 +0200
Subject: Fix modpost failure when rx handlers are not inlined.

When CONFIG_MAC80211_MESH=n and CONFIG_MAC80211_NOINLINE=y,
gcc doesn't optimize out a call to ieee80211_rx_h_mesh_fwding,
even if the previous comparison is always false in this case.
This leads to the following errors during modpost:

ERROR: "mpp_path_lookup" [net/mac80211/mac80211.ko] undefined!
ERROR: "mpp_path_add" [net/mac80211/mac80211.ko] undefined!

Fix by removing the possibility of uninlining
ieee80211_rx_h_mesh_fwding rx handler.

Signed-off-by: Davide Pesavento <davidepesa@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c489865761b..77e7b014872 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1379,7 +1379,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 	return RX_QUEUED;
 }
 
-static ieee80211_rx_result debug_noinline
+static ieee80211_rx_result
 ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_hdr *hdr;
-- 
cgit v1.2.3


From 04a4bb55bcf35b63d40fd2725e58599ff8310dd7 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@marvell.com>
Date: Wed, 1 Oct 2008 02:33:12 -0700
Subject: net: add skb_recycle_check() to enable netdriver skb recycling

This patch adds skb_recycle_check(), which can be used by a network
driver after transmitting an skb to check whether this skb can be
recycled as a receive buffer.

skb_recycle_check() checks that the skb is not shared or cloned, and
that it is linear and its head portion large enough (as determined by
the driver) to be recycled as a receive buffer.  If these conditions
are met, it does any necessary reference count dropping and cleans
up the skbuff as if it just came from __alloc_skb().

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ca1ccdf1ef7..2c218a0808b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -363,8 +363,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 	}
 }
 
-/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb)
+static void skb_release_head_state(struct sk_buff *skb)
 {
 	dst_release(skb->dst);
 #ifdef CONFIG_XFRM
@@ -388,6 +387,12 @@ static void skb_release_all(struct sk_buff *skb)
 	skb->tc_verd = 0;
 #endif
 #endif
+}
+
+/* Free everything but the sk_buff shell. */
+static void skb_release_all(struct sk_buff *skb)
+{
+	skb_release_head_state(skb);
 	skb_release_data(skb);
 }
 
@@ -424,6 +429,38 @@ void kfree_skb(struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
+int skb_recycle_check(struct sk_buff *skb, int skb_size)
+{
+	struct skb_shared_info *shinfo;
+
+	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
+		return 0;
+
+	skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
+	if (skb_end_pointer(skb) - skb->head < skb_size)
+		return 0;
+
+	if (skb_shared(skb) || skb_cloned(skb))
+		return 0;
+
+	skb_release_head_state(skb);
+	shinfo = skb_shinfo(skb);
+	atomic_set(&shinfo->dataref, 1);
+	shinfo->nr_frags = 0;
+	shinfo->gso_size = 0;
+	shinfo->gso_segs = 0;
+	shinfo->gso_type = 0;
+	shinfo->ip6_frag_id = 0;
+	shinfo->frag_list = NULL;
+
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb_reset_tail_pointer(skb);
+	skb->data = skb->head + NET_SKB_PAD;
+
+	return 1;
+}
+EXPORT_SYMBOL(skb_recycle_check);
+
 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 {
 	new->tstamp		= old->tstamp;
-- 
cgit v1.2.3


From 93c8b90f01f0dc73891da4e84b26524b61d29d66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Wed, 1 Oct 2008 02:48:31 -0700
Subject: ipv6: almost identical frag hashing funcs combined
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

$ diff-funcs ip6qhashfn reassembly.c netfilter/nf_conntrack_reasm.c
 --- reassembly.c:ip6qhashfn()
 +++ netfilter/nf_conntrack_reasm.c:ip6qhashfn()
@@ -1,5 +1,5 @@
-static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr,
-			       struct in6_addr *daddr)
+static unsigned int ip6qhashfn(__be32 id, const struct in6_addr *saddr,
+			       const struct in6_addr *daddr)
 {
 	u32 a, b, c;

@@ -9,7 +9,7 @@

 	a += JHASH_GOLDEN_RATIO;
 	b += JHASH_GOLDEN_RATIO;
-	c += ip6_frags.rnd;
+	c += nf_frags.rnd;
 	__jhash_mix(a, b, c);

 	a += (__force u32)saddr->s6_addr32[3];

And codiff xx.o.old xx.o.new:

net/ipv6/netfilter/nf_conntrack_reasm.c:
  ip6qhashfn         | -512
  nf_hashfn          |   +6
  nf_ct_frag6_gather |  +36
 3 functions changed, 42 bytes added, 512 bytes removed, diff: -470
net/ipv6/reassembly.c:
  ip6qhashfn    | -512
  ip6_hashfn    |   +7
  ipv6_frag_rcv |  +89
 3 functions changed, 96 bytes added, 512 bytes removed, diff: -416

net/ipv6/reassembly.c:
  inet6_hash_frag | +510
 1 function changed, 510 bytes added, diff: +510

Total: -376

Compile tested.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 32 ++------------------------------
 net/ipv6/reassembly.c                   | 11 ++++++-----
 2 files changed, 8 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 52d06dd4b81..9967ac7a01a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -27,7 +27,6 @@
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 #include <linux/random.h>
-#include <linux/jhash.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -103,39 +102,12 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = {
 };
 #endif
 
-static unsigned int ip6qhashfn(__be32 id, const struct in6_addr *saddr,
-			       const struct in6_addr *daddr)
-{
-	u32 a, b, c;
-
-	a = (__force u32)saddr->s6_addr32[0];
-	b = (__force u32)saddr->s6_addr32[1];
-	c = (__force u32)saddr->s6_addr32[2];
-
-	a += JHASH_GOLDEN_RATIO;
-	b += JHASH_GOLDEN_RATIO;
-	c += nf_frags.rnd;
-	__jhash_mix(a, b, c);
-
-	a += (__force u32)saddr->s6_addr32[3];
-	b += (__force u32)daddr->s6_addr32[0];
-	c += (__force u32)daddr->s6_addr32[1];
-	__jhash_mix(a, b, c);
-
-	a += (__force u32)daddr->s6_addr32[2];
-	b += (__force u32)daddr->s6_addr32[3];
-	c += (__force u32)id;
-	__jhash_mix(a, b, c);
-
-	return c & (INETFRAGS_HASHSZ - 1);
-}
-
 static unsigned int nf_hashfn(struct inet_frag_queue *q)
 {
 	const struct nf_ct_frag6_queue *nq;
 
 	nq = container_of(q, struct nf_ct_frag6_queue, q);
-	return ip6qhashfn(nq->id, &nq->saddr, &nq->daddr);
+	return inet6_hash_frag(nq->id, &nq->saddr, &nq->daddr, nf_frags.rnd);
 }
 
 static void nf_skb_free(struct sk_buff *skb)
@@ -209,7 +181,7 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst)
 	arg.dst = dst;
 
 	read_lock_bh(&nf_frags.lock);
-	hash = ip6qhashfn(id, src, dst);
+	hash = inet6_hash_frag(id, src, dst, nf_frags.rnd);
 
 	q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
 	local_bh_enable();
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 89184b576e2..2eeadfa039c 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -99,8 +99,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
  * callers should be careful not to use the hash value outside the ipfrag_lock
  * as doing so could race with ipfrag_hash_rnd being recalculated.
  */
-static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr,
-			       struct in6_addr *daddr)
+unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
+			     const struct in6_addr *daddr, u32 rnd)
 {
 	u32 a, b, c;
 
@@ -110,7 +110,7 @@ static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr,
 
 	a += JHASH_GOLDEN_RATIO;
 	b += JHASH_GOLDEN_RATIO;
-	c += ip6_frags.rnd;
+	c += rnd;
 	__jhash_mix(a, b, c);
 
 	a += (__force u32)saddr->s6_addr32[3];
@@ -125,13 +125,14 @@ static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr,
 
 	return c & (INETFRAGS_HASHSZ - 1);
 }
+EXPORT_SYMBOL_GPL(inet6_hash_frag);
 
 static unsigned int ip6_hashfn(struct inet_frag_queue *q)
 {
 	struct frag_queue *fq;
 
 	fq = container_of(q, struct frag_queue, q);
-	return ip6qhashfn(fq->id, &fq->saddr, &fq->daddr);
+	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr, ip6_frags.rnd);
 }
 
 int ip6_frag_match(struct inet_frag_queue *q, void *a)
@@ -247,7 +248,7 @@ fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst,
 	arg.dst = dst;
 
 	read_lock(&ip6_frags.lock);
-	hash = ip6qhashfn(id, src, dst);
+	hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd);
 
 	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
 	if (q == NULL)
-- 
cgit v1.2.3


From 12a169e7d8f4b1c95252d8b04ed0f1033ed7cfe2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 1 Oct 2008 07:03:24 -0700
Subject: ipsec: Put dumpers on the dump list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Herbert Xu came up with the idea and the original patch to make
xfrm_state dump list contain also dumpers:

As it is we go to extraordinary lengths to ensure that states
don't go away while dumpers go to sleep.  It's much easier if
we just put the dumpers themselves on the list since they can't
go away while they're going.

I've also changed the order of addition on new states to prevent
a never-ending dump.

Timo Teräs improved the patch to apply cleanly to latest tree,
modified iteration code to be more readable by using a common
struct for entries in the list, implemented the same idea for
xfrm_policy dumping and moved the af_key specific "last" entry
caching to af_key.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Timo Teras <timo.teras@iki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c       |  38 ++++++++++++++---
 net/xfrm/xfrm_policy.c | 111 ++++++++++++++++++++++++++-----------------------
 net/xfrm/xfrm_state.c  | 109 ++++++++++++++++--------------------------------
 net/xfrm/xfrm_user.c   |   4 +-
 4 files changed, 130 insertions(+), 132 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index b7f5a1c353e..7ae641df70b 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -58,6 +58,7 @@ struct pfkey_sock {
 			struct xfrm_policy_walk	policy;
 			struct xfrm_state_walk	state;
 		} u;
+		struct sk_buff	*skb;
 	} dump;
 };
 
@@ -76,6 +77,10 @@ static int pfkey_can_dump(struct sock *sk)
 static void pfkey_terminate_dump(struct pfkey_sock *pfk)
 {
 	if (pfk->dump.dump) {
+		if (pfk->dump.skb) {
+			kfree_skb(pfk->dump.skb);
+			pfk->dump.skb = NULL;
+		}
 		pfk->dump.done(pfk);
 		pfk->dump.dump = NULL;
 		pfk->dump.done = NULL;
@@ -308,12 +313,25 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
 
 static int pfkey_do_dump(struct pfkey_sock *pfk)
 {
+	struct sadb_msg *hdr;
 	int rc;
 
 	rc = pfk->dump.dump(pfk);
 	if (rc == -ENOBUFS)
 		return 0;
 
+	if (pfk->dump.skb) {
+		if (!pfkey_can_dump(&pfk->sk))
+			return 0;
+
+		hdr = (struct sadb_msg *) pfk->dump.skb->data;
+		hdr->sadb_msg_seq = 0;
+		hdr->sadb_msg_errno = rc;
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+				&pfk->sk);
+		pfk->dump.skb = NULL;
+	}
+
 	pfkey_terminate_dump(pfk);
 	return rc;
 }
@@ -1744,9 +1762,14 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr)
 	out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
 	out_hdr->sadb_msg_errno = 0;
 	out_hdr->sadb_msg_reserved = 0;
-	out_hdr->sadb_msg_seq = count;
+	out_hdr->sadb_msg_seq = count + 1;
 	out_hdr->sadb_msg_pid = pfk->dump.msg_pid;
-	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk);
+
+	if (pfk->dump.skb)
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+				&pfk->sk);
+	pfk->dump.skb = out_skb;
+
 	return 0;
 }
 
@@ -2245,7 +2268,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	return 0;
 
 out:
-	xp->dead = 1;
+	xp->walk.dead = 1;
 	xfrm_policy_destroy(xp);
 	return err;
 }
@@ -2583,9 +2606,14 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
 	out_hdr->sadb_msg_type = SADB_X_SPDDUMP;
 	out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
 	out_hdr->sadb_msg_errno = 0;
-	out_hdr->sadb_msg_seq = count;
+	out_hdr->sadb_msg_seq = count + 1;
 	out_hdr->sadb_msg_pid = pfk->dump.msg_pid;
-	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk);
+
+	if (pfk->dump.skb)
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+				&pfk->sk);
+	pfk->dump.skb = out_skb;
+
 	return 0;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ef9ccbc3875..b7ec08025ff 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,7 +46,7 @@ EXPORT_SYMBOL(xfrm_cfg_mutex);
 
 static DEFINE_RWLOCK(xfrm_policy_lock);
 
-static struct list_head xfrm_policy_bytype[XFRM_POLICY_TYPE_MAX];
+static struct list_head xfrm_policy_all;
 unsigned int xfrm_policy_count[XFRM_POLICY_MAX*2];
 EXPORT_SYMBOL(xfrm_policy_count);
 
@@ -164,7 +164,7 @@ static void xfrm_policy_timer(unsigned long data)
 
 	read_lock(&xp->lock);
 
-	if (xp->dead)
+	if (xp->walk.dead)
 		goto out;
 
 	dir = xfrm_policy_id2dir(xp->index);
@@ -236,7 +236,7 @@ struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp)
 	policy = kzalloc(sizeof(struct xfrm_policy), gfp);
 
 	if (policy) {
-		INIT_LIST_HEAD(&policy->bytype);
+		INIT_LIST_HEAD(&policy->walk.all);
 		INIT_HLIST_NODE(&policy->bydst);
 		INIT_HLIST_NODE(&policy->byidx);
 		rwlock_init(&policy->lock);
@@ -252,17 +252,13 @@ EXPORT_SYMBOL(xfrm_policy_alloc);
 
 void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
-	BUG_ON(!policy->dead);
+	BUG_ON(!policy->walk.dead);
 
 	BUG_ON(policy->bundles);
 
 	if (del_timer(&policy->timer))
 		BUG();
 
-	write_lock_bh(&xfrm_policy_lock);
-	list_del(&policy->bytype);
-	write_unlock_bh(&xfrm_policy_lock);
-
 	security_xfrm_policy_free(policy->security);
 	kfree(policy);
 }
@@ -310,8 +306,8 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 	int dead;
 
 	write_lock_bh(&policy->lock);
-	dead = policy->dead;
-	policy->dead = 1;
+	dead = policy->walk.dead;
+	policy->walk.dead = 1;
 	write_unlock_bh(&policy->lock);
 
 	if (unlikely(dead)) {
@@ -609,6 +605,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	if (delpol) {
 		hlist_del(&delpol->bydst);
 		hlist_del(&delpol->byidx);
+		list_del(&delpol->walk.all);
 		xfrm_policy_count[dir]--;
 	}
 	policy->index = delpol ? delpol->index : xfrm_gen_index(policy->type, dir);
@@ -617,7 +614,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
 		xfrm_pol_hold(policy);
-	list_add_tail(&policy->bytype, &xfrm_policy_bytype[policy->type]);
+	list_add(&policy->walk.all, &xfrm_policy_all);
 	write_unlock_bh(&xfrm_policy_lock);
 
 	if (delpol)
@@ -684,6 +681,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
 				}
 				hlist_del(&pol->bydst);
 				hlist_del(&pol->byidx);
+				list_del(&pol->walk.all);
 				xfrm_policy_count[dir]--;
 			}
 			ret = pol;
@@ -727,6 +725,7 @@ struct xfrm_policy *xfrm_policy_byid(u8 type, int dir, u32 id, int delete,
 				}
 				hlist_del(&pol->bydst);
 				hlist_del(&pol->byidx);
+				list_del(&pol->walk.all);
 				xfrm_policy_count[dir]--;
 			}
 			ret = pol;
@@ -840,6 +839,7 @@ int xfrm_policy_flush(u8 type, struct xfrm_audit *audit_info)
 					continue;
 				hlist_del(&pol->bydst);
 				hlist_del(&pol->byidx);
+				list_del(&pol->walk.all);
 				write_unlock_bh(&xfrm_policy_lock);
 
 				xfrm_audit_policy_delete(pol, 1,
@@ -867,60 +867,68 @@ int xfrm_policy_walk(struct xfrm_policy_walk *walk,
 		     int (*func)(struct xfrm_policy *, int, int, void*),
 		     void *data)
 {
-	struct xfrm_policy *old, *pol, *last = NULL;
+	struct xfrm_policy *pol;
+	struct xfrm_policy_walk_entry *x;
 	int error = 0;
 
 	if (walk->type >= XFRM_POLICY_TYPE_MAX &&
 	    walk->type != XFRM_POLICY_TYPE_ANY)
 		return -EINVAL;
 
-	if (walk->policy == NULL && walk->count != 0)
+	if (list_empty(&walk->walk.all) && walk->seq != 0)
 		return 0;
 
-	old = pol = walk->policy;
-	walk->policy = NULL;
-	read_lock_bh(&xfrm_policy_lock);
-
-	for (; walk->cur_type < XFRM_POLICY_TYPE_MAX; walk->cur_type++) {
-		if (walk->type != walk->cur_type &&
-		    walk->type != XFRM_POLICY_TYPE_ANY)
+	write_lock_bh(&xfrm_policy_lock);
+	if (list_empty(&walk->walk.all))
+		x = list_first_entry(&xfrm_policy_all, struct xfrm_policy_walk_entry, all);
+	else
+		x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all);
+	list_for_each_entry_from(x, &xfrm_policy_all, all) {
+		if (x->dead)
 			continue;
-
-		if (pol == NULL) {
-			pol = list_first_entry(&xfrm_policy_bytype[walk->cur_type],
-					       struct xfrm_policy, bytype);
-		}
-		list_for_each_entry_from(pol, &xfrm_policy_bytype[walk->cur_type], bytype) {
-			if (pol->dead)
-				continue;
-			if (last) {
-				error = func(last, xfrm_policy_id2dir(last->index),
-					     walk->count, data);
-				if (error) {
-					xfrm_pol_hold(last);
-					walk->policy = last;
-					goto out;
-				}
-			}
-			last = pol;
-			walk->count++;
+		pol = container_of(x, struct xfrm_policy, walk);
+		if (walk->type != XFRM_POLICY_TYPE_ANY &&
+		    walk->type != pol->type)
+			continue;
+		error = func(pol, xfrm_policy_id2dir(pol->index),
+			     walk->seq, data);
+		if (error) {
+			list_move_tail(&walk->walk.all, &x->all);
+			goto out;
 		}
-		pol = NULL;
+		walk->seq++;
 	}
-	if (walk->count == 0) {
+	if (walk->seq == 0) {
 		error = -ENOENT;
 		goto out;
 	}
-	if (last)
-		error = func(last, xfrm_policy_id2dir(last->index), 0, data);
+	list_del_init(&walk->walk.all);
 out:
-	read_unlock_bh(&xfrm_policy_lock);
-	if (old != NULL)
-		xfrm_pol_put(old);
+	write_unlock_bh(&xfrm_policy_lock);
 	return error;
 }
 EXPORT_SYMBOL(xfrm_policy_walk);
 
+void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
+{
+	INIT_LIST_HEAD(&walk->walk.all);
+	walk->walk.dead = 1;
+	walk->type = type;
+	walk->seq = 0;
+}
+EXPORT_SYMBOL(xfrm_policy_walk_init);
+
+void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
+{
+	if (list_empty(&walk->walk.all))
+		return;
+
+	write_lock_bh(&xfrm_policy_lock);
+	list_del(&walk->walk.all);
+	write_unlock_bh(&xfrm_policy_lock);
+}
+EXPORT_SYMBOL(xfrm_policy_walk_done);
+
 /*
  * Find policy to apply to this flow.
  *
@@ -1077,7 +1085,7 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
 	struct hlist_head *chain = policy_hash_bysel(&pol->selector,
 						     pol->family, dir);
 
-	list_add_tail(&pol->bytype, &xfrm_policy_bytype[pol->type]);
+	list_add(&pol->walk.all, &xfrm_policy_all);
 	hlist_add_head(&pol->bydst, chain);
 	hlist_add_head(&pol->byidx, xfrm_policy_byidx+idx_hash(pol->index));
 	xfrm_policy_count[dir]++;
@@ -1095,6 +1103,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 
 	hlist_del(&pol->bydst);
 	hlist_del(&pol->byidx);
+	list_del(&pol->walk.all);
 	xfrm_policy_count[dir]--;
 
 	return pol;
@@ -1720,7 +1729,7 @@ restart:
 
 		for (pi = 0; pi < npols; pi++) {
 			read_lock_bh(&pols[pi]->lock);
-			pol_dead |= pols[pi]->dead;
+			pol_dead |= pols[pi]->walk.dead;
 			read_unlock_bh(&pols[pi]->lock);
 		}
 
@@ -2415,9 +2424,7 @@ static void __init xfrm_policy_init(void)
 			panic("XFRM: failed to allocate bydst hash\n");
 	}
 
-	for (dir = 0; dir < XFRM_POLICY_TYPE_MAX; dir++)
-		INIT_LIST_HEAD(&xfrm_policy_bytype[dir]);
-
+	INIT_LIST_HEAD(&xfrm_policy_all);
 	INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task);
 	register_netdevice_notifier(&xfrm_dev_notifier);
 }
@@ -2601,7 +2608,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
-	if (unlikely(pol->dead)) {
+	if (unlikely(pol->walk.dead)) {
 		/* target policy has been deleted */
 		write_unlock_bh(&pol->lock);
 		return -ENOENT;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 053970e8765..747fd8c291a 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -59,14 +59,6 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 static unsigned int xfrm_state_num;
 static unsigned int xfrm_state_genid;
 
-/* Counter indicating ongoing walk, protected by xfrm_state_lock. */
-static unsigned long xfrm_state_walk_ongoing;
-/* Counter indicating walk completion, protected by xfrm_cfg_mutex. */
-static unsigned long xfrm_state_walk_completed;
-
-/* List of outstanding state walks used to set the completed counter.  */
-static LIST_HEAD(xfrm_state_walks);
-
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
 
@@ -199,8 +191,7 @@ static DEFINE_RWLOCK(xfrm_state_afinfo_lock);
 static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
 
 static struct work_struct xfrm_state_gc_work;
-static LIST_HEAD(xfrm_state_gc_leftovers);
-static LIST_HEAD(xfrm_state_gc_list);
+static HLIST_HEAD(xfrm_state_gc_list);
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
 int __xfrm_state_delete(struct xfrm_state *x);
@@ -412,23 +403,16 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 
 static void xfrm_state_gc_task(struct work_struct *data)
 {
-	struct xfrm_state *x, *tmp;
-	unsigned long completed;
+	struct xfrm_state *x;
+	struct hlist_node *entry, *tmp;
+	struct hlist_head gc_list;
 
-	mutex_lock(&xfrm_cfg_mutex);
 	spin_lock_bh(&xfrm_state_gc_lock);
-	list_splice_tail_init(&xfrm_state_gc_list, &xfrm_state_gc_leftovers);
+	hlist_move_list(&xfrm_state_gc_list, &gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 
-	completed = xfrm_state_walk_completed;
-	mutex_unlock(&xfrm_cfg_mutex);
-
-	list_for_each_entry_safe(x, tmp, &xfrm_state_gc_leftovers, gclist) {
-		if ((long)(x->lastused - completed) > 0)
-			break;
-		list_del(&x->gclist);
+	hlist_for_each_entry_safe(x, entry, tmp, &gc_list, gclist)
 		xfrm_state_gc_destroy(x);
-	}
 
 	wake_up(&km_waitq);
 }
@@ -529,7 +513,7 @@ struct xfrm_state *xfrm_state_alloc(void)
 	if (x) {
 		atomic_set(&x->refcnt, 1);
 		atomic_set(&x->tunnel_users, 0);
-		INIT_LIST_HEAD(&x->all);
+		INIT_LIST_HEAD(&x->km.all);
 		INIT_HLIST_NODE(&x->bydst);
 		INIT_HLIST_NODE(&x->bysrc);
 		INIT_HLIST_NODE(&x->byspi);
@@ -556,7 +540,7 @@ void __xfrm_state_destroy(struct xfrm_state *x)
 	WARN_ON(x->km.state != XFRM_STATE_DEAD);
 
 	spin_lock_bh(&xfrm_state_gc_lock);
-	list_add_tail(&x->gclist, &xfrm_state_gc_list);
+	hlist_add_head(&x->gclist, &xfrm_state_gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 	schedule_work(&xfrm_state_gc_work);
 }
@@ -569,8 +553,7 @@ int __xfrm_state_delete(struct xfrm_state *x)
 	if (x->km.state != XFRM_STATE_DEAD) {
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&xfrm_state_lock);
-		x->lastused = xfrm_state_walk_ongoing;
-		list_del_rcu(&x->all);
+		list_del(&x->km.all);
 		hlist_del(&x->bydst);
 		hlist_del(&x->bysrc);
 		if (x->id.spi)
@@ -871,7 +854,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 
 		if (km_query(x, tmpl, pol) == 0) {
 			x->km.state = XFRM_STATE_ACQ;
-			list_add_tail(&x->all, &xfrm_state_all);
+			list_add(&x->km.all, &xfrm_state_all);
 			hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 			h = xfrm_src_hash(daddr, saddr, family);
 			hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
@@ -940,7 +923,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 
 	x->genid = ++xfrm_state_genid;
 
-	list_add_tail(&x->all, &xfrm_state_all);
+	list_add(&x->km.all, &xfrm_state_all);
 
 	h = xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
 			  x->props.reqid, x->props.family);
@@ -1069,7 +1052,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re
 		xfrm_state_hold(x);
 		x->timer.expires = jiffies + sysctl_xfrm_acq_expires*HZ;
 		add_timer(&x->timer);
-		list_add_tail(&x->all, &xfrm_state_all);
+		list_add(&x->km.all, &xfrm_state_all);
 		hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 		h = xfrm_src_hash(daddr, saddr, family);
 		hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
@@ -1566,79 +1549,59 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
 		    int (*func)(struct xfrm_state *, int, void*),
 		    void *data)
 {
-	struct xfrm_state *old, *x, *last = NULL;
+	struct xfrm_state *state;
+	struct xfrm_state_walk *x;
 	int err = 0;
 
-	if (walk->state == NULL && walk->count != 0)
+	if (walk->seq != 0 && list_empty(&walk->all))
 		return 0;
 
-	old = x = walk->state;
-	walk->state = NULL;
 	spin_lock_bh(&xfrm_state_lock);
-	if (x == NULL)
-		x = list_first_entry(&xfrm_state_all, struct xfrm_state, all);
+	if (list_empty(&walk->all))
+		x = list_first_entry(&xfrm_state_all, struct xfrm_state_walk, all);
+	else
+		x = list_entry(&walk->all, struct xfrm_state_walk, all);
 	list_for_each_entry_from(x, &xfrm_state_all, all) {
-		if (x->km.state == XFRM_STATE_DEAD)
+		if (x->state == XFRM_STATE_DEAD)
 			continue;
-		if (!xfrm_id_proto_match(x->id.proto, walk->proto))
+		state = container_of(x, struct xfrm_state, km);
+		if (!xfrm_id_proto_match(state->id.proto, walk->proto))
 			continue;
-		if (last) {
-			err = func(last, walk->count, data);
-			if (err) {
-				xfrm_state_hold(last);
-				walk->state = last;
-				goto out;
-			}
+		err = func(state, walk->seq, data);
+		if (err) {
+			list_move_tail(&walk->all, &x->all);
+			goto out;
 		}
-		last = x;
-		walk->count++;
+		walk->seq++;
 	}
-	if (walk->count == 0) {
+	if (walk->seq == 0) {
 		err = -ENOENT;
 		goto out;
 	}
-	if (last)
-		err = func(last, 0, data);
+	list_del_init(&walk->all);
 out:
 	spin_unlock_bh(&xfrm_state_lock);
-	if (old != NULL)
-		xfrm_state_put(old);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_walk);
 
 void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
 {
+	INIT_LIST_HEAD(&walk->all);
 	walk->proto = proto;
-	walk->state = NULL;
-	walk->count = 0;
-	list_add_tail(&walk->list, &xfrm_state_walks);
-	walk->genid = ++xfrm_state_walk_ongoing;
+	walk->state = XFRM_STATE_DEAD;
+	walk->seq = 0;
 }
 EXPORT_SYMBOL(xfrm_state_walk_init);
 
 void xfrm_state_walk_done(struct xfrm_state_walk *walk)
 {
-	struct list_head *prev;
-
-	if (walk->state != NULL) {
-		xfrm_state_put(walk->state);
-		walk->state = NULL;
-	}
-
-	prev = walk->list.prev;
-	list_del(&walk->list);
-
-	if (prev != &xfrm_state_walks) {
-		list_entry(prev, struct xfrm_state_walk, list)->genid =
-			walk->genid;
+	if (list_empty(&walk->all))
 		return;
-	}
-
-	xfrm_state_walk_completed = walk->genid;
 
-	if (!list_empty(&xfrm_state_gc_leftovers))
-		schedule_work(&xfrm_state_gc_work);
+	spin_lock_bh(&xfrm_state_lock);
+	list_del(&walk->all);
+	spin_lock_bh(&xfrm_state_lock);
 }
 EXPORT_SYMBOL(xfrm_state_walk_done);
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 04c41504f84..76f75df21e1 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1102,7 +1102,7 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
 	return xp;
  error:
 	*errp = err;
-	xp->dead = 1;
+	xp->walk.dead = 1;
 	xfrm_policy_destroy(xp);
 	return NULL;
 }
@@ -1595,7 +1595,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -ENOENT;
 
 	read_lock(&xp->lock);
-	if (xp->dead) {
+	if (xp->walk.dead) {
 		read_unlock(&xp->lock);
 		goto out;
 	}
-- 
cgit v1.2.3


From 4edd87ad5cad8e159e0db3ce3131b3d97219c9cd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 1 Oct 2008 07:09:38 -0700
Subject: net: BUG instead of corrupting memory in pskb_expand_head

If the caller of pskb_expand_head specifies a negative nhead
we'll silently overwrite other people's memory.  This patch
makes it BUG instead.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2c218a0808b..8bd248a6487 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -738,6 +738,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 #endif
 	long off;
 
+	BUG_ON(nhead < 0);
+
 	if (skb_shared(skb))
 		BUG();
 
-- 
cgit v1.2.3


From a210d01ae3ee006b59e54e772a7f212486e0f021 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 1 Oct 2008 07:28:28 -0700
Subject: ipv4: Loosen source address check on IPv4 output

ip_route_output() contains a check to make sure that no flows with
non-local source IP addresses are routed. This obviously makes using
such addresses impossible.

This patch introduces a flowi flag which makes omitting this check
possible. The new flag provides a way of handling transparent and
non-transparent connections differently.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f62187bb6d0..a6d7c584f53 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2361,11 +2361,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 		    ipv4_is_zeronet(oldflp->fl4_src))
 			goto out;
 
-		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-		dev_out = ip_dev_find(net, oldflp->fl4_src);
-		if (dev_out == NULL)
-			goto out;
-
 		/* I removed check for oif == dev_out->oif here.
 		   It was wrong for two reasons:
 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
@@ -2377,6 +2372,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 		if (oldflp->oif == 0
 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			dev_out = ip_dev_find(net, oldflp->fl4_src);
+			if (dev_out == NULL)
+				goto out;
+
 			/* Special hack: user can direct multicasts
 			   and limited broadcast via necessary interface
 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2395,9 +2395,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			fl.oif = dev_out->ifindex;
 			goto make_route;
 		}
-		if (dev_out)
+
+		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			dev_out = ip_dev_find(net, oldflp->fl4_src);
+			if (dev_out == NULL)
+				goto out;
 			dev_put(dev_out);
-		dev_out = NULL;
+			dev_out = NULL;
+		}
 	}
 
 
-- 
cgit v1.2.3


From f5715aea4564f233767ea1d944b2637a5fd7cd2e Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 1 Oct 2008 07:30:02 -0700
Subject: ipv4: Implement IP_TRANSPARENT socket option

This patch introduces the IP_TRANSPARENT socket option: enabling that
will make the IPv4 routing omit the non-local source address check on
output. Setting IP_TRANSPARENT requires NET_ADMIN capability.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_timewait_sock.c |  1 +
 net/ipv4/ip_sockglue.c        | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 743f011b9a8..1c5fd38f882 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -126,6 +126,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_reuse	    = sk->sk_reuse;
 		tw->tw_hash	    = sk->sk_hash;
 		tw->tw_ipv6only	    = 0;
+		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
 		twsk_net_set(tw, hold_net(sock_net(sk)));
 		atomic_set(&tw->tw_refcnt, 1);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 105d92a039b..465abf0a986 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -419,7 +419,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_TTL) | (1<<IP_HDRINCL) |
 			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
-			     (1<<IP_PASSSEC))) ||
+			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
 	    optname == IP_MULTICAST_TTL ||
 	    optname == IP_MULTICAST_LOOP) {
 		if (optlen >= sizeof(int)) {
@@ -878,6 +878,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		err = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
+	case IP_TRANSPARENT:
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		if (optlen < 1)
+			goto e_inval;
+		inet->transparent = !!val;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -1130,6 +1140,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_FREEBIND:
 		val = inet->freebind;
 		break;
+	case IP_TRANSPARENT:
+		val = inet->transparent;
+		break;
 	default:
 		release_sock(sk);
 		return -ENOPROTOOPT;
-- 
cgit v1.2.3


From b9fb15067ce93497bef852c05e406d7a96212a9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B3th=20L=C3=A1szl=C3=B3=20Attila?= <panther@balabit.hu>
Date: Wed, 1 Oct 2008 07:31:24 -0700
Subject: ipv4: Allow binding to non-local addresses if IP_TRANSPARENT is set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Setting IP_TRANSPARENT is not really useful without allowing non-local
binds for the socket. To make user-space code simpler we allow these
binds even if IP_TRANSPARENT is set but IP_FREEBIND is not.

Signed-off-by: Tóth László Attila <panther@balabit.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a3ac1fa71a..1fbff5fa424 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -469,7 +469,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	 */
 	err = -EADDRNOTAVAIL;
 	if (!sysctl_ip_nonlocal_bind &&
-	    !inet->freebind &&
+	    !(inet->freebind || inet->transparent) &&
 	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
 	    chk_addr_ret != RTN_LOCAL &&
 	    chk_addr_ret != RTN_MULTICAST &&
-- 
cgit v1.2.3


From 1668e010cbe1a7567c81d4c02d31dde9859e9da1 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 1 Oct 2008 07:33:10 -0700
Subject: ipv4: Make inet_sock.h independent of route.h

inet_iif() in inet_sock.h requires route.h. Since users of inet_iif()
usually require other route.h functionality anyway this patch moves
inet_iif() to route.h.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_nat_helper.c | 1 +
 net/ipv6/af_inet6.c                | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 11976ea2988..112dcfa1290 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -16,6 +16,7 @@
 #include <linux/udp.h>
 #include <net/checksum.h>
 #include <net/tcp.h>
+#include <net/route.h>
 
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_conntrack.h>
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 95055f8c3f3..f018704ecb8 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -50,6 +50,7 @@
 #include <net/ipip.h>
 #include <net/protocol.h>
 #include <net/inet_common.h>
+#include <net/route.h>
 #include <net/transp_v6.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
-- 
cgit v1.2.3


From 88ef4a5a78e63420dd1dd770f1bd1dc198926b04 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 1 Oct 2008 07:41:00 -0700
Subject: tcp: Handle TCP SYN+ACK/ACK/RST transparency

The TCP stack sends out SYN+ACK/ACK/RST reply packets in response to
incoming packets. The non-local source address check on output bites
us again, as replies for transparently redirected traffic won't have a
chance to leave the node.

This patch selectively sets the FLOWI_FLAG_ANYSRC flag when doing the
route lookup for those replies. Transparent replies are enabled if the
listening socket has the transparent socket flag set.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d13688e3558..8b24bd833cb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -591,6 +591,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 				      ip_hdr(skb)->saddr, /* XXX */
 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 
 	net = dev_net(skb->dst->dev);
 	ip_send_reply(net->ipv4.tcp_sock, skb,
@@ -606,7 +607,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 
 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 			    u32 win, u32 ts, int oif,
-			    struct tcp_md5sig_key *key)
+			    struct tcp_md5sig_key *key,
+			    int reply_flags)
 {
 	struct tcphdr *th = tcp_hdr(skb);
 	struct {
@@ -659,6 +661,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 				    ip_hdr(skb)->daddr, &rep.th);
 	}
 #endif
+	arg.flags = reply_flags;
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 				      ip_hdr(skb)->saddr, /* XXX */
 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
@@ -681,7 +684,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcptw->tw_ts_recent,
 			tw->tw_bound_dev_if,
-			tcp_twsk_md5_key(tcptw)
+			tcp_twsk_md5_key(tcptw),
+			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 			);
 
 	inet_twsk_put(tw);
@@ -694,7 +698,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 			req->ts_recent,
 			0,
-			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr));
+			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
+			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 }
 
 /*
@@ -1244,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->loc_addr = daddr;
 	ireq->rmt_addr = saddr;
+	ireq->no_srccheck = inet_sk(sk)->transparent;
 	ireq->opt = tcp_v4_save_options(sk, skb);
 	if (!want_cookie)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
-- 
cgit v1.2.3


From 86b08d867d7de001ab224180ed7865fab93fd56e Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 1 Oct 2008 07:44:42 -0700
Subject: ipv4: Make Netfilter's ip_route_me_harder() non-local address
 compatible

Netfilter's ip_route_me_harder() tries to re-route packets either
generated or re-routed by Netfilter. This patch changes
ip_route_me_harder() to handle packets from non-locally-bound sockets
with IP_TRANSPARENT set as local and to set the appropriate flowi
flags when re-doing the routing lookup.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 1 +
 net/ipv4/ip_output.c            | 4 +++-
 net/ipv4/netfilter.c            | 3 +++
 net/ipv4/syncookies.c           | 2 ++
 4 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0c1ae68ee84..432c570c9f5 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -335,6 +335,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
 					.saddr = ireq->loc_addr,
 					.tos = RT_CONN_FLAGS(sk) } },
 			    .proto = sk->sk_protocol,
+			    .flags = inet_sk_flowi_flags(sk),
 			    .uli_u = { .ports =
 				       { .sport = inet_sk(sk)->sport,
 					 .dport = ireq->rmt_port } } };
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d533a89e08d..d2a8f8bb78a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -340,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 							.saddr = inet->saddr,
 							.tos = RT_CONN_FLAGS(sk) } },
 					    .proto = sk->sk_protocol,
+					    .flags = inet_sk_flowi_flags(sk),
 					    .uli_u = { .ports =
 						       { .sport = inet->sport,
 							 .dport = inet->dport } } };
@@ -1371,7 +1372,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 				    .uli_u = { .ports =
 					       { .sport = tcp_hdr(skb)->dest,
 						 .dport = tcp_hdr(skb)->source } },
-				    .proto = sk->sk_protocol };
+				    .proto = sk->sk_protocol,
+				    .flags = ip_reply_arg_flowi_flags(arg) };
 		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(sock_net(sk), &rt, &fl))
 			return;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index f8edacdf991..01671ad51ed 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -20,6 +20,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 	unsigned int type;
 
 	type = inet_addr_type(&init_net, iph->saddr);
+	if (skb->sk && inet_sk(skb->sk)->transparent)
+		type = RTN_LOCAL;
 	if (addr_type == RTN_UNSPEC)
 		addr_type = type;
 
@@ -33,6 +35,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
 		fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
 		fl.mark = skb->mark;
+		fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
 		if (ip_route_output_key(&init_net, &rt, &fl) != 0)
 			return -1;
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 9d38005abba..929302b2ba9 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -16,6 +16,7 @@
 #include <linux/cryptohash.h>
 #include <linux/kernel.h>
 #include <net/tcp.h>
+#include <net/route.h>
 
 /* Timestamps: lowest 9 bits store TCP options */
 #define TSBITS 9
@@ -337,6 +338,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 						.saddr = ireq->loc_addr,
 						.tos = RT_CONN_FLAGS(sk) } },
 				    .proto = IPPROTO_TCP,
+				    .flags = inet_sk_flowi_flags(sk),
 				    .uli_u = { .ports =
 					       { .sport = th->dest,
 						 .dport = th->source } } };
-- 
cgit v1.2.3


From a3116ac5c216fc3c145906a46df9ce542ff7dcf2 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 1 Oct 2008 07:46:49 -0700
Subject: tcp: Port redirection support for TCP

Current TCP code relies on the local port of the listening socket
being the same as the destination address of the incoming
connection. Port redirection used by many transparent proxying
techniques obviously breaks this, so we have to store the original
destination port address.

This patch extends struct inet_request_sock and stores the incoming
destination port value there. It also modifies the handshake code to
use that value as the source port when sending reply packets.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 2 ++
 net/ipv4/syncookies.c           | 1 +
 net/ipv4/tcp_output.c           | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 432c570c9f5..21fcc5a9045 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -516,6 +516,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 		newicsk->icsk_bind_hash = NULL;
 
 		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+		inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
+		inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 929302b2ba9..d346c22aa6a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -297,6 +297,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	treq->rcv_isn		= ntohl(th->seq) - 1;
 	treq->snt_isn		= cookie;
 	req->mss		= mss;
+	ireq->loc_port		= th->dest;
 	ireq->rmt_port		= th->source;
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a8499ef3234..493553c71d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2275,7 +2275,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	th->syn = 1;
 	th->ack = 1;
 	TCP_ECN_make_synack(req, th);
-	th->source = inet_sk(sk)->sport;
+	th->source = ireq->loc_port;
 	th->dest = ireq->rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is
 	 * not even correctly set)
-- 
cgit v1.2.3


From bcd41303f422015ab662c9276d108414aa75b796 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 1 Oct 2008 07:48:10 -0700
Subject: udp: Export UDP socket lookup function

The iptables tproxy code has to be able to do UDP socket hash lookups,
so we have to provide an exported lookup function for this purpose.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 57e26fa6618..c83d0ef469c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -302,6 +302,13 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	return result;
 }
 
+struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+			     __be32 daddr, __be16 dport, int dif)
+{
+	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+
 static inline struct sock *udp_v4_mcast_next(struct sock *sk,
 					     __be16 loc_port, __be32 loc_addr,
 					     __be16 rmt_port, __be32 rmt_addr,
-- 
cgit v1.2.3


From 2cd9b822bfa79fc1335d3e71a0449f3cd0b5078e Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Thu, 19 Jun 2008 17:59:13 -0400
Subject: sctp: Only mark chunks as missing when there are gaps

Frist small step in optimizing SACK processing.   Do not call
sctp_mark_missing() when there are no gaps reported and thus
not missing chunks.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/outqueue.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4328ad5439c..ef5ea7423cc 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1129,12 +1129,13 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 	unsigned outstanding;
 	struct sctp_transport *primary = asoc->peer.primary_path;
 	int count_of_newacks = 0;
+	int gap_ack_blocks;
 
 	/* Grab the association's destination address list. */
 	transport_list = &asoc->peer.transport_addr_list;
 
 	sack_ctsn = ntohl(sack->cum_tsn_ack);
-
+	gap_ack_blocks = ntohs(sack->num_gap_ack_blocks);
 	/*
 	 * SFR-CACC algorithm:
 	 * On receipt of a SACK the sender SHOULD execute the
@@ -1161,7 +1162,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 	 * A) Initialize the cacc_saw_newack to 0 for all destination
 	 * addresses.
 	 */
-	if (sack->num_gap_ack_blocks &&
+	if (gap_ack_blocks &&
 	    primary->cacc.changeover_active) {
 		list_for_each_entry(transport, transport_list, transports) {
 			transport->cacc.cacc_saw_newack = 0;
@@ -1170,9 +1171,8 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 
 	/* Get the highest TSN in the sack. */
 	highest_tsn = sack_ctsn;
-	if (sack->num_gap_ack_blocks)
-		highest_tsn +=
-		    ntohs(frags[ntohs(sack->num_gap_ack_blocks) - 1].gab.end);
+	if (gap_ack_blocks)
+		highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
 
 	if (TSN_lt(asoc->highest_sacked, highest_tsn)) {
 		highest_new_tsn = highest_tsn;
@@ -1181,11 +1181,11 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 		highest_new_tsn = sctp_highest_new_tsn(sack, asoc);
 	}
 
+
 	/* Run through the retransmit queue.  Credit bytes received
 	 * and free those chunks that we can.
 	 */
 	sctp_check_transmitted(q, &q->retransmit, NULL, sack, highest_new_tsn);
-	sctp_mark_missing(q, &q->retransmit, NULL, highest_new_tsn, 0);
 
 	/* Run through the transmitted queue.
 	 * Credit bytes received and free those chunks which we can.
@@ -1204,9 +1204,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 			count_of_newacks ++;
 	}
 
-	list_for_each_entry(transport, transport_list, transports) {
-		sctp_mark_missing(q, &transport->transmitted, transport,
-				  highest_new_tsn, count_of_newacks);
+	if (gap_ack_blocks) {
+		sctp_mark_missing(q, &q->retransmit, NULL, highest_new_tsn, 0);
+
+		list_for_each_entry(transport, transport_list, transports)
+			sctp_mark_missing(q, &transport->transmitted, transport,
+					  highest_new_tsn, count_of_newacks);
 	}
 
 	/* Move the Cumulative TSN Ack Point if appropriate.  */
-- 
cgit v1.2.3


From ab5216a5bd453752f04bb79c29e8f01b11d69006 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Thu, 19 Jun 2008 18:17:24 -0400
Subject: sctp: Optimize SFR-CACC transport list walking during SACK processing

There is a possibility of walking the transport list twice during
SACK processing when doing SFR-CACC algorithm.  We can restructure
the code to only do this once.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/outqueue.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index ef5ea7423cc..c8de4da57f3 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1145,27 +1145,31 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 	 * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be
 	 * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for
 	 * all destinations.
-	 */
-	if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) {
-		primary->cacc.changeover_active = 0;
-		list_for_each_entry(transport, transport_list,
-				transports) {
-			transport->cacc.cycling_changeover = 0;
-		}
-	}
-
-	/*
-	 * SFR-CACC algorithm:
 	 * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE
 	 * is set the receiver of the SACK MUST take the following actions:
 	 *
 	 * A) Initialize the cacc_saw_newack to 0 for all destination
 	 * addresses.
+	 *
+	 * Only bother if changeover_active is set. Otherwise, this is
+	 * totally suboptimal to do on every SACK.
 	 */
-	if (gap_ack_blocks &&
-	    primary->cacc.changeover_active) {
-		list_for_each_entry(transport, transport_list, transports) {
-			transport->cacc.cacc_saw_newack = 0;
+	if (primary->cacc.changeover_active) {
+		u8 clear_cycling = 0;
+
+		if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) {
+			primary->cacc.changeover_active = 0;
+			clear_cycling = 1;
+		}
+
+		if (clear_cycling || gap_ack_blocks) {
+			list_for_each_entry(transport, transport_list,
+					transports) {
+				if (clear_cycling)
+					transport->cacc.cycling_changeover = 0;
+				if (gap_ack_blocks)
+					transport->cacc.cacc_saw_newack = 0;
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From 845b8eda4d783a7ce2670d482a716840a650389e Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 23 Jun 2008 15:26:20 -0400
Subject: sctp: Retransmit list is ineligable for missing indications

Chunks placed on the retransmit list are marked as inelegible
for fast retrasnmission.   Since missing indications determine
when fast reransmission is done, there is not point in calling
sctp_mark_missing() on the retransmit list since those chunks
will not be marked.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/outqueue.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index c8de4da57f3..da8d846301c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1209,8 +1209,6 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
 	}
 
 	if (gap_ack_blocks) {
-		sctp_mark_missing(q, &q->retransmit, NULL, highest_new_tsn, 0);
-
 		list_for_each_entry(transport, transport_list, transports)
 			sctp_mark_missing(q, &transport->transmitted, transport,
 					  highest_new_tsn, count_of_newacks);
-- 
cgit v1.2.3


From c226ef9b83694311327f3ab0036c6de9c22e9daf Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 25 Jul 2008 12:44:09 -0400
Subject: sctp: reduce memory footprint of sctp_chunk structure

sctp_chunks should be put on a diet.  This is some of the low hanging
fruit that we can strip out.  Changes all the __s8/__u8 flags to
bitfields.  Saves 12 bytes per chunk.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/output.c        |  2 +-
 net/sctp/outqueue.c      | 14 +++++++-------
 net/sctp/sm_make_chunk.c |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sctp/output.c b/net/sctp/output.c
index 225c7123c41..c3f417f7ec6 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -699,7 +699,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
 	 *    When a Fast Retransmit is being performed the sender SHOULD
 	 *    ignore the value of cwnd and SHOULD NOT delay retransmission.
 	 */
-	if (chunk->fast_retransmit <= 0)
+	if (chunk->fast_retransmit != SCTP_NEED_FRTX)
 		if (transport->flight_size >= transport->cwnd) {
 			retval = SCTP_XMIT_RWND_FULL;
 			goto finish;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index da8d846301c..247ebc95c1e 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -420,7 +420,7 @@ void sctp_retransmit_mark(struct sctp_outq *q,
 		 * be added to the retransmit queue.
 		 */
 		if ((reason == SCTP_RTXR_FAST_RTX  &&
-			    (chunk->fast_retransmit > 0)) ||
+			    (chunk->fast_retransmit == SCTP_NEED_FRTX)) ||
 		    (reason != SCTP_RTXR_FAST_RTX  && !chunk->tsn_gap_acked)) {
 			/* If this chunk was sent less then 1 rto ago, do not
 			 * retransmit this chunk, but give the peer time
@@ -650,8 +650,8 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			/* Mark the chunk as ineligible for fast retransmit
 			 * after it is retransmitted.
 			 */
-			if (chunk->fast_retransmit > 0)
-				chunk->fast_retransmit = -1;
+			if (chunk->fast_retransmit == SCTP_NEED_FRTX)
+				chunk->fast_retransmit = SCTP_DONT_FRTX;
 
 			/* Force start T3-rtx timer when fast retransmitting
 			 * the earliest outstanding TSN
@@ -680,8 +680,8 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 	 */
 	if (rtx_timeout || fast_rtx) {
 		list_for_each_entry(chunk1, lqueue, transmitted_list) {
-			if (chunk1->fast_retransmit > 0)
-				chunk1->fast_retransmit = -1;
+			if (chunk1->fast_retransmit == SCTP_NEED_FRTX)
+				chunk1->fast_retransmit = SCTP_DONT_FRTX;
 		}
 	}
 
@@ -1656,7 +1656,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
 		 * chunk if it has NOT been fast retransmitted or marked for
 		 * fast retransmit already.
 		 */
-		if (!chunk->fast_retransmit &&
+		if (chunk->fast_retransmit == SCTP_CAN_FRTX &&
 		    !chunk->tsn_gap_acked &&
 		    TSN_lt(tsn, highest_new_tsn_in_sack)) {
 
@@ -1681,7 +1681,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
 		 */
 
 		if (chunk->tsn_missing_report >= 3) {
-			chunk->fast_retransmit = 1;
+			chunk->fast_retransmit = SCTP_NEED_FRTX;
 			do_fast_retransmit = 1;
 		}
 	}
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index d68869f966c..99fe0747cc9 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1211,7 +1211,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
 	 */
 	retval->tsn_missing_report = 0;
 	retval->tsn_gap_acked = 0;
-	retval->fast_retransmit = 0;
+	retval->fast_retransmit = SCTP_CAN_FRTX;
 
 	/* If this is a fragmented message, track all fragments
 	 * of the message (for SEND_FAILED).
-- 
cgit v1.2.3


From 52cae8f06babf9eed327479c1aa024ce3732f912 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 18 Aug 2008 10:34:34 -0400
Subject: sctp: try harder to figure out address family when checking wildcards

sctp_is_any() function that is used to check for wildcard addresses
only looks at the address itself to determine the address family.
This function is used in the API to check the address passed in from
the user.  If the user simply zerroes out the sockaddr_storage and
pass that in, we'll end up failing.  So, let's try harder to determine
the address family by also checking the socket if it's possible.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/bind_addr.c | 16 +++++++++++++---
 net/sctp/ipv6.c      |  5 +++--
 net/sctp/socket.c    | 10 +++++-----
 3 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f62bc246893..6d5944a745d 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -457,7 +457,7 @@ static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
 {
 	int error = 0;
 
-	if (sctp_is_any(addr)) {
+	if (sctp_is_any(NULL, addr)) {
 		error = sctp_copy_local_addr_list(dest, scope, gfp, flags);
 	} else if (sctp_in_scope(addr, scope)) {
 		/* Now that the address is in scope, check to see if
@@ -477,11 +477,21 @@ static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
 }
 
 /* Is this a wildcard address?  */
-int sctp_is_any(const union sctp_addr *addr)
+int sctp_is_any(struct sock *sk, const union sctp_addr *addr)
 {
-	struct sctp_af *af = sctp_get_af_specific(addr->sa.sa_family);
+	unsigned short fam = 0;
+	struct sctp_af *af;
+
+	/* Try to get the right address family */
+	if (addr->sa.sa_family != AF_UNSPEC)
+		fam = addr->sa.sa_family;
+	else if (sk)
+		fam = sk->sk_family;
+
+	af = sctp_get_af_specific(fam);
 	if (!af)
 		return 0;
+
 	return af->is_any(addr);
 }
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 47f91afa021..c78da3c9dd3 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -837,6 +837,7 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
 			       struct sctp_sock *opt)
 {
 	struct sctp_af *af1, *af2;
+	struct sock *sk = sctp_opt2sk(opt);
 
 	af1 = sctp_get_af_specific(addr1->sa.sa_family);
 	af2 = sctp_get_af_specific(addr2->sa.sa_family);
@@ -845,11 +846,11 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
 		return 0;
 
 	/* If the socket is IPv6 only, v4 addrs will not match */
-	if (__ipv6_only_sock(sctp_opt2sk(opt)) && af1 != af2)
+	if (__ipv6_only_sock(sk) && af1 != af2)
 		return 0;
 
 	/* Today, wildcard AF_INET/AF_INET6. */
-	if (sctp_is_any(addr1) || sctp_is_any(addr2))
+	if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2))
 		return 1;
 
 	if (addr1->sa.sa_family != addr2->sa.sa_family)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5ffb9dec1c3..a1b904529d5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2309,7 +2309,7 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk,
 	/* If an address other than INADDR_ANY is specified, and
 	 * no transport is found, then the request is invalid.
 	 */
-	if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+	if (!sctp_is_any(sk, ( union sctp_addr *)&params.spp_address)) {
 		trans = sctp_addr_id2transport(sk, &params.spp_address,
 					       params.spp_assoc_id);
 		if (!trans)
@@ -4062,7 +4062,7 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 	/* If an address other than INADDR_ANY is specified, and
 	 * no transport is found, then the request is invalid.
 	 */
-	if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+	if (!sctp_is_any(sk, ( union sctp_addr *)&params.spp_address)) {
 		trans = sctp_addr_id2transport(sk, &params.spp_address,
 					       params.spp_assoc_id);
 		if (!trans) {
@@ -4414,7 +4414,7 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
 	if (sctp_list_single_entry(&bp->address_list)) {
 		addr = list_entry(bp->address_list.next,
 				  struct sctp_sockaddr_entry, list);
-		if (sctp_is_any(&addr->a)) {
+		if (sctp_is_any(sk, &addr->a)) {
 			rcu_read_lock();
 			list_for_each_entry_rcu(addr,
 						&sctp_local_addr_list, list) {
@@ -4602,7 +4602,7 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	if (sctp_list_single_entry(&bp->address_list)) {
 		addr = list_entry(bp->address_list.next,
 				  struct sctp_sockaddr_entry, list);
-		if (sctp_is_any(&addr->a)) {
+		if (sctp_is_any(sk, &addr->a)) {
 			cnt = sctp_copy_laddrs_old(sk, bp->port,
 						   getaddrs.addr_num,
 						   addrs, &bytes_copied);
@@ -4695,7 +4695,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	if (sctp_list_single_entry(&bp->address_list)) {
 		addr = list_entry(bp->address_list.next,
 				  struct sctp_sockaddr_entry, list);
-		if (sctp_is_any(&addr->a)) {
+		if (sctp_is_any(sk, &addr->a)) {
 			cnt = sctp_copy_laddrs(sk, bp->port, addrs,
 						space_left, &bytes_copied);
 			if (cnt < 0) {
-- 
cgit v1.2.3


From 536428a9b9a98495f7ea54ad95cad83e22f1d47d Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 5 Sep 2008 08:55:26 +0800
Subject: sctp: Fix to start T5-shutdown-guard timer while enter SHUTDOWN-SENT
 state

RFC 4960: Section 9.2
The sender of the SHUTDOWN MAY also start an overall guard timer
'T5-shutdown-guard' to bound the overall time for the shutdown
sequence.  At the expiration of this timer, the sender SHOULD abort
the association by sending an ABORT chunk.  If the 'T5-shutdown-
guard' timer is used, it SHOULD be set to the recommended value of 5
times 'RTO.Max'.

The timer 'T5-shutdown-guard' is used to counter the overall time
for shutdown sequence, and it's start by the sender of the SHUTDOWN.
So timer 'T5-shutdown-guard' should be start when we send the first
SHUTDOWN chunk and enter the SHUTDOWN-SENT state, not start when we
receipt of the SHUTDOWN primitive and enter SHUTDOWN-PENDING state.

If 'T5-shutdown-guard' timer is start at SHUTDOWN-PENDING state, the
association may be ABORT while data is still transmitting.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/sm_statefuns.c  | 24 +++++++-----------------
 net/sctp/sm_statetable.c |  2 +-
 2 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 7c622af2ce5..e57d1d3ad0a 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2076,10 +2076,6 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort(
 		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
 		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
 
-	/* Stop the T5-shutdown guard timer.  */
-	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
-			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
-
 	return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands);
 }
 
@@ -4543,13 +4539,6 @@ sctp_disposition_t sctp_sf_do_9_2_prm_shutdown(
 	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
 			SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));
 
-	/* sctpimpguide-05 Section 2.12.2
-	 * The sender of the SHUTDOWN MAY also start an overall guard timer
-	 * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
-	 */
-	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
-			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
-
 	disposition = SCTP_DISPOSITION_CONSUME;
 	if (sctp_outq_is_empty(&asoc->outqueue)) {
 		disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type,
@@ -4994,6 +4983,13 @@ sctp_disposition_t sctp_sf_do_9_2_start_shutdown(
 	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
 			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
 
+	/* RFC 4960 Section 9.2
+	 * The sender of the SHUTDOWN MAY also start an overall guard timer
+	 * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
 	if (asoc->autoclose)
 		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
 				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
@@ -5520,12 +5516,6 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire(
 	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
 			SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));
 
-	/* sctpimpguide-05 Section 2.12.2
-	 * The sender of the SHUTDOWN MAY also start an overall guard timer
-	 * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
-	 */
-	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
-			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
 	disposition = SCTP_DISPOSITION_CONSUME;
 	if (sctp_outq_is_empty(&asoc->outqueue)) {
 		disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type,
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index d991237fb40..dd4ddc40c0a 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -897,7 +897,7 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
 	/* SCTP_STATE_ESTABLISHED */ \
 	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
 	/* SCTP_STATE_SHUTDOWN_PENDING */ \
-	TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
 	/* SCTP_STATE_SHUTDOWN_SENT */ \
 	TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
 	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
-- 
cgit v1.2.3


From 8190f89dfd09dae0c117fb0745f5a820bd19a5a4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 8 Sep 2008 12:13:55 +0800
Subject: sctp: Fix the SNMP counter of SCTP_MIB_OUTOFBLUES

RFC3873 defined SCTP_MIB_OUTOFBLUES:

 sctpOutOfBlues OBJECT-TYPE
   SYNTAX         Counter32
   MAX-ACCESS     read-only
   STATUS         current
   DESCRIPTION
        "The number of out of the blue packets received by the host.
        An out of the blue packet is an SCTP packet correctly formed,
        including the proper checksum, but for which the receiver was
        unable to identify an appropriate association."
   REFERENCE
        "Section 8.4 in RFC2960 deals with the Out-Of-The-Blue
         (OOTB) packet definition and procedures."

But OOTB packet INIT, INIT-ACK and SHUTDOWN-ACK(COOKIE-WAIT or
COOKIE-ECHOED state) are not counted by SCTP_MIB_OUTOFBLUES.

Case 1(INIT):

Endpoint A               Endpoint B
(CLOSED)                 (CLOSED)

 INIT     ---------->
          <----------    ABORT

Case 2(INIT-ACK):

Endpoint A               Endpoint B
(CLOSED)                 (CLOSED)

 INIT-ACK  ---------->
           <----------   ABORT

Case 3(SHUTDOWN-ACK):

Endpoint A               Endpoint B
(CLOSED)                 (CLOSED)

          <----------    INIT
 SHUTDOWN-ACK  ---------->
           <----------   SHUTDOWN-COMPLETE

Case 4(SHUTDOWN-ACK):

Endpoint A               Endpoint B
(CLOSED)                 (COOKIE-ECHOED)

 SHUTDOWN-ACK  ---------->
           <----------   SHUTDOWN-COMPLETE

This patch fixed the problem.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/sm_statefuns.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index e57d1d3ad0a..81dfaee49b7 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -315,8 +315,10 @@ sctp_disposition_t sctp_sf_do_5_1B_init(const struct sctp_endpoint *ep,
 	/* If the packet is an OOTB packet which is temporarily on the
 	 * control endpoint, respond with an ABORT.
 	 */
-	if (ep == sctp_sk((sctp_get_ctl_sock()))->ep)
+	if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) {
+		SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
 		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+	}
 
 	/* 3.1 A packet containing an INIT chunk MUST have a zero Verification
 	 * Tag.
@@ -635,8 +637,10 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep,
 	/* If the packet is an OOTB packet which is temporarily on the
 	 * control endpoint, respond with an ABORT.
 	 */
-	if (ep == sctp_sk((sctp_get_ctl_sock()))->ep)
+	if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) {
+		SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
 		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+	}
 
 	/* Make sure that the COOKIE_ECHO chunk has a valid length.
 	 * In this case, we check that we have enough for at least a
@@ -3378,6 +3382,8 @@ sctp_disposition_t sctp_sf_do_8_5_1_E_sa(const struct sctp_endpoint *ep,
 	 * packet and the state function that handles OOTB SHUTDOWN_ACK is
 	 * called with a NULL association.
 	 */
+	SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
+
 	return sctp_sf_shut_8_4_5(ep, NULL, type, arg, commands);
 }
 
-- 
cgit v1.2.3


From 96cd0d3d710e64c55e034b77052d7ac46f094759 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 8 Sep 2008 14:00:26 -0400
Subject: sctp: enable cookie-echo retransmission transport switch

This patch enables cookie-echo retransmission transport switch
feature. If COOKIE-ECHO retransmission happens, it will be sent
to the address other than the one last sent to.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/sm_sideeffect.c | 78 +++++++++++++++++++++++++++---------------------
 net/sctp/sm_statefuns.c  |  2 ++
 2 files changed, 46 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 9732c797e8e..13d9eea5cf1 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -889,6 +889,35 @@ static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands,
 		sctp_ulpq_tail_event(&asoc->ulpq, ev);
 }
 
+
+static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
+				    sctp_event_timeout_t timer,
+				    char *name)
+{
+	struct sctp_transport *t;
+
+	t = asoc->init_last_sent_to;
+	asoc->init_err_counter++;
+
+	if (t->init_sent_count > (asoc->init_cycle + 1)) {
+		asoc->timeouts[timer] *= 2;
+		if (asoc->timeouts[timer] > asoc->max_init_timeo) {
+			asoc->timeouts[timer] = asoc->max_init_timeo;
+		}
+		asoc->init_cycle++;
+		SCTP_DEBUG_PRINTK(
+			"T1 %s Timeout adjustment"
+			" init_err_counter: %d"
+			" cycle: %d"
+			" timeout: %ld\n",
+			name,
+			asoc->init_err_counter,
+			asoc->init_cycle,
+			asoc->timeouts[timer]);
+	}
+
+}
+
 /* These three macros allow us to pull the debugging code out of the
  * main flow of sctp_do_sm() to keep attention focused on the real
  * functionality there.
@@ -1196,6 +1225,11 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
 						SCTP_CHUNK(cmd->obj.ptr));
 
+			if (new_obj->transport) {
+				new_obj->transport->init_sent_count++;
+				asoc->init_last_sent_to = new_obj->transport;
+			}
+
 			/* FIXME - Eventually come up with a cleaner way to
 			 * enabling COOKIE-ECHO + DATA bundling during
 			 * multihoming stale cookie scenarios, the following
@@ -1345,26 +1379,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			 * all transports have been tried at the current
 			 * timeout.
 			 */
-			t = asoc->init_last_sent_to;
-			asoc->init_err_counter++;
-
-			if (t->init_sent_count > (asoc->init_cycle + 1)) {
-				asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] *= 2;
-				if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] >
-				    asoc->max_init_timeo) {
-					asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
-						asoc->max_init_timeo;
-				}
-				asoc->init_cycle++;
-				SCTP_DEBUG_PRINTK(
-					"T1 INIT Timeout adjustment"
-					" init_err_counter: %d"
-					" cycle: %d"
-					" timeout: %ld\n",
-					asoc->init_err_counter,
-					asoc->init_cycle,
-					asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]);
-			}
+			sctp_cmd_t1_timer_update(asoc,
+						SCTP_EVENT_TIMEOUT_T1_INIT,
+						"INIT");
 
 			sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
 					SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
@@ -1377,20 +1394,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			 * all transports have been tried at the current
 			 * timeout.
 			 */
-			asoc->init_err_counter++;
-
-			asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] *= 2;
-			if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] >
-			    asoc->max_init_timeo) {
-				asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
-					asoc->max_init_timeo;
-			}
-			SCTP_DEBUG_PRINTK(
-				"T1 COOKIE Timeout adjustment"
-				" init_err_counter: %d"
-				" timeout: %ld\n",
-				asoc->init_err_counter,
-				asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]);
+			sctp_cmd_t1_timer_update(asoc,
+						SCTP_EVENT_TIMEOUT_T1_COOKIE,
+						"COOKIE");
 
 			/* If we've sent any data bundled with
 			 * COOKIE-ECHO we need to resend.
@@ -1422,6 +1428,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_INIT_COUNTER_RESET:
 			asoc->init_err_counter = 0;
 			asoc->init_cycle = 0;
+			list_for_each_entry(t, &asoc->peer.transport_addr_list,
+					    transports) {
+				t->init_sent_count = 0;
+			}
 			break;
 
 		case SCTP_CMD_REPORT_DUP:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 81dfaee49b7..ea3a34cbe47 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5307,6 +5307,8 @@ sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep
 		if (!repl)
 			return SCTP_DISPOSITION_NOMEM;
 
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+				SCTP_CHUNK(repl));
 		/* Issue a sideeffect to do the needed accounting. */
 		sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
 				SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
-- 
cgit v1.2.3


From e69c4e0f1210450841e40716894ba6a877b31d52 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 15 Sep 2008 16:29:49 -0400
Subject: sctp: correctly save sctp_adaptation from parameter.

The INIT perameter carries the adapatation value in network-byte
order.  We need to store it in host byte order as expected
by data types and the user API.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 net/sctp/sm_make_chunk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 99fe0747cc9..76726bcff3e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2467,7 +2467,7 @@ do_addr_param:
 		break;
 
 	case SCTP_PARAM_ADAPTATION_LAYER_IND:
-		asoc->peer.adaptation_ind = param.aind->adaptation_ind;
+		asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind);
 		break;
 
 	case SCTP_PARAM_SET_PRIMARY:
-- 
cgit v1.2.3


From 2937391385807b3da9cd7a39345259caf550b032 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 3 Oct 2008 17:15:38 -0400
Subject: NLM: Remove unused argument from svc_addsock() function

Clean up: The svc_addsock() function no longer uses its "proto"
argument, so remove it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svcsock.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f91377c1495..95293f549e9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1167,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 
 int svc_addsock(struct svc_serv *serv,
 		int fd,
-		char *name_return,
-		int *proto)
+		char *name_return)
 {
 	int err = 0;
 	struct socket *so = sockfd_lookup(fd, &err);
@@ -1203,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv,
 		sockfd_put(so);
 		return err;
 	}
-	if (proto) *proto = so->sk->sk_protocol;
 	return one_sock_name(name_return, svsk);
 }
 EXPORT_SYMBOL_GPL(svc_addsock);
-- 
cgit v1.2.3


From 25532824fb727744a302edb25c6a6ac10b82cb63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Sun, 5 Oct 2008 11:14:27 -0700
Subject: Phonet: modules auto-loading support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/af_phonet.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 1d8df6b7e3d..0a74aeaf5ad 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -64,6 +64,11 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
 	}
 
 	pnp = phonet_proto_get(protocol);
+#ifdef CONFIG_KMOD
+	if (pnp == NULL &&
+	    request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0)
+		pnp = phonet_proto_get(protocol);
+#endif
 	if (pnp == NULL)
 		return -EPROTONOSUPPORT;
 	if (sock->type != pnp->sock_type) {
@@ -94,7 +99,7 @@ out:
 }
 
 static struct net_proto_family phonet_proto_family = {
-	.family = AF_PHONET,
+	.family = PF_PHONET,
 	.create = pn_socket_create,
 	.owner = THIS_MODULE,
 };
@@ -447,7 +452,7 @@ static int __init phonet_init(void)
 
 err:
 	phonet_sysctl_exit();
-	sock_unregister(AF_PHONET);
+	sock_unregister(PF_PHONET);
 	dev_remove_pack(&phonet_packet_type);
 	phonet_device_exit();
 	return err;
@@ -457,7 +462,7 @@ static void __exit phonet_exit(void)
 {
 	isi_unregister();
 	phonet_sysctl_exit();
-	sock_unregister(AF_PHONET);
+	sock_unregister(PF_PHONET);
 	dev_remove_pack(&phonet_packet_type);
 	phonet_device_exit();
 }
@@ -466,3 +471,4 @@ module_init(phonet_init);
 module_exit(phonet_exit);
 MODULE_DESCRIPTION("Phonet protocol stack for Linux");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_PHONET);
-- 
cgit v1.2.3


From 9995a32b4d14dcda2f8df58030526bee91114c16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Sun, 5 Oct 2008 11:14:48 -0700
Subject: Phonet: connected sockets glue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/socket.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

(limited to 'net')

diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index dfd4061646d..cea1136cbe4 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -25,11 +25,13 @@
 
 #include <linux/kernel.h>
 #include <linux/net.h>
+#include <linux/poll.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
 
 #include <linux/phonet.h>
 #include <net/phonet/phonet.h>
+#include <net/phonet/pep.h>
 #include <net/phonet/pn_dev.h>
 
 static int pn_socket_release(struct socket *sock)
@@ -166,6 +168,24 @@ static int pn_socket_autobind(struct socket *sock)
 	return 0; /* socket was already bound */
 }
 
+static int pn_socket_accept(struct socket *sock, struct socket *newsock,
+				int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sock *newsk;
+	int err;
+
+	newsk = sk->sk_prot->accept(sk, flags, &err);
+	if (!newsk)
+		return err;
+
+	lock_sock(newsk);
+	sock_graft(newsk, newsock);
+	newsock->state = SS_CONNECTED;
+	release_sock(newsk);
+	return 0;
+}
+
 static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
 				int *sockaddr_len, int peer)
 {
@@ -182,6 +202,33 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
 	return 0;
 }
 
+static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
+					poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct pep_sock *pn = pep_sk(sk);
+	unsigned int mask = 0;
+
+	poll_wait(file, &sock->wait, wait);
+
+	switch (sk->sk_state) {
+	case TCP_LISTEN:
+		return hlist_empty(&pn->ackq) ? 0 : POLLIN;
+	case TCP_CLOSE:
+		return POLLERR;
+	}
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+	else if (sk->sk_state == TCP_CLOSE_WAIT)
+		return POLLHUP;
+
+	if (sk->sk_state == TCP_ESTABLISHED && pn->tx_credits)
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+}
+
 static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
 				unsigned long arg)
 {
@@ -220,6 +267,30 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
 	return sk->sk_prot->ioctl(sk, cmd, arg);
 }
 
+static int pn_socket_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	if (sock->state != SS_UNCONNECTED)
+		return -EINVAL;
+	if (pn_socket_autobind(sock))
+		return -ENOBUFS;
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_CLOSE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	sk->sk_state = TCP_LISTEN;
+	sk->sk_ack_backlog = 0;
+	sk->sk_max_ack_backlog = backlog;
+out:
+	release_sock(sk);
+	return err;
+}
+
 static int pn_socket_sendmsg(struct kiocb *iocb, struct socket *sock,
 				struct msghdr *m, size_t total_len)
 {
@@ -256,6 +327,32 @@ const struct proto_ops phonet_dgram_ops = {
 	.sendpage	= sock_no_sendpage,
 };
 
+const struct proto_ops phonet_stream_ops = {
+	.family		= AF_PHONET,
+	.owner		= THIS_MODULE,
+	.release	= pn_socket_release,
+	.bind		= pn_socket_bind,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= pn_socket_accept,
+	.getname	= pn_socket_getname,
+	.poll		= pn_socket_poll,
+	.ioctl		= pn_socket_ioctl,
+	.listen		= pn_socket_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = sock_no_setsockopt,
+	.compat_getsockopt = compat_sock_no_getsockopt,
+#endif
+	.sendmsg	= pn_socket_sendmsg,
+	.recvmsg	= sock_common_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+EXPORT_SYMBOL(phonet_stream_ops);
+
 static DEFINE_MUTEX(port_mutex);
 
 /* allocate port for a socket */
-- 
cgit v1.2.3


From 9641458d3ec42def729fde64669abf07f3220cd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Sun, 5 Oct 2008 11:15:13 -0700
Subject: Phonet: Pipe End Point for Phonet Pipes protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This protocol provides some connection handling and negotiated
congestion control. Nokia cellular modems use it for bulk transfers.
It provides packet boundaries (hence SOCK_SEQPACKET). Congestion
control is per packet rather per byte, so we do not re-use the
generic socket memory accounting.

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile    |   4 +-
 net/phonet/af_phonet.c |   3 +
 net/phonet/pep.c       | 908 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 914 insertions(+), 1 deletion(-)
 create mode 100644 net/phonet/pep.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index ae9c3ed5be8..505df2a04a8 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_PHONET) += phonet.o
+obj-$(CONFIG_PHONET) += phonet.o pn_pep.o
 
 phonet-objs := \
 	pn_dev.o \
@@ -7,3 +7,5 @@ phonet-objs := \
 	datagram.o \
 	sysctl.o \
 	af_phonet.o
+
+pn_pep-objs := pep.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 0a74aeaf5ad..9e9c6fce11a 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -58,6 +58,9 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
 		case SOCK_DGRAM:
 			protocol = PN_PROTO_PHONET;
 			break;
+		case SOCK_SEQPACKET:
+			protocol = PN_PROTO_PIPE;
+			break;
 		default:
 			return -EPROTONOSUPPORT;
 		}
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
new file mode 100644
index 00000000000..c5dfecb207d
--- /dev/null
+++ b/net/phonet/pep.c
@@ -0,0 +1,908 @@
+/*
+ * File: pep.c
+ *
+ * Phonet pipe protocol end point socket
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/ioctls.h>
+
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pep.h>
+
+/* sk_state values:
+ * TCP_CLOSE		sock not in use yet
+ * TCP_CLOSE_WAIT	disconnected pipe
+ * TCP_LISTEN		listening pipe endpoint
+ * TCP_SYN_RECV		connected pipe in disabled state
+ * TCP_ESTABLISHED	connected pipe in enabled state
+ *
+ * pep_sock locking:
+ *  - sk_state, ackq, hlist: sock lock needed
+ *  - listener: read only
+ *  - pipe_handle: read only
+ */
+
+#define CREDITS_MAX	10
+#define CREDITS_THR	7
+
+static const struct sockaddr_pn pipe_srv = {
+	.spn_family = AF_PHONET,
+	.spn_resource = 0xD9, /* pipe service */
+};
+
+#define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */
+
+/* Get the next TLV sub-block. */
+static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen,
+					void *buf)
+{
+	void *data = NULL;
+	struct {
+		u8 sb_type;
+		u8 sb_len;
+	} *ph, h;
+	int buflen = *plen;
+
+	ph = skb_header_pointer(skb, 0, 2, &h);
+	if (ph == NULL || ph->sb_len < 2 || !pskb_may_pull(skb, ph->sb_len))
+		return NULL;
+	ph->sb_len -= 2;
+	*ptype = ph->sb_type;
+	*plen = ph->sb_len;
+
+	if (buflen > ph->sb_len)
+		buflen = ph->sb_len;
+	data = skb_header_pointer(skb, 2, buflen, buf);
+	__skb_pull(skb, 2 + ph->sb_len);
+	return data;
+}
+
+static int pep_reply(struct sock *sk, struct sk_buff *oskb,
+			u8 code, const void *data, int len, gfp_t priority)
+{
+	const struct pnpipehdr *oph = pnp_hdr(oskb);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+	if (!skb)
+		return -ENOMEM;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER);
+	__skb_put(skb, len);
+	skb_copy_to_linear_data(skb, data, len);
+	__skb_push(skb, sizeof(*ph));
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = oph->utid;
+	ph->message_id = oph->message_id + 1; /* REQ -> RESP */
+	ph->pipe_handle = oph->pipe_handle;
+	ph->error_code = code;
+
+	return pn_skb_send(sk, skb, &pipe_srv);
+}
+
+#define PAD 0x00
+static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
+{
+	static const u8 data[20] = {
+		PAD, PAD, PAD, 2 /* sub-blocks */,
+		PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD,
+			PN_MULTI_CREDIT_FLOW_CONTROL,
+			PN_ONE_CREDIT_FLOW_CONTROL,
+			PN_LEGACY_FLOW_CONTROL,
+			PAD,
+		PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD,
+			PN_MULTI_CREDIT_FLOW_CONTROL,
+			PN_ONE_CREDIT_FLOW_CONTROL,
+			PN_LEGACY_FLOW_CONTROL,
+			PAD,
+	};
+
+	might_sleep();
+	return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data),
+				GFP_KERNEL);
+}
+
+static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code)
+{
+	static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ };
+	WARN_ON(code == PN_PIPE_NO_ERROR);
+	return pep_reply(sk, skb, code, data, sizeof(data), GFP_ATOMIC);
+}
+
+/* Control requests are not sent by the pipe service and have a specific
+ * message format. */
+static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code)
+{
+	const struct pnpipehdr *oph = pnp_hdr(oskb);
+	struct sk_buff *skb;
+	struct pnpipehdr *ph;
+	struct sockaddr_pn dst;
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER + 4, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PHONET_HEADER);
+	ph = (struct pnpipehdr *)skb_put(skb, sizeof(*ph) + 4);
+
+	ph->utid = oph->utid;
+	ph->message_id = PNS_PEP_CTRL_RESP;
+	ph->pipe_handle = oph->pipe_handle;
+	ph->data[0] = oph->data[1]; /* CTRL id */
+	ph->data[1] = oph->data[0]; /* PEP type */
+	ph->data[2] = code; /* error code, at an usual offset */
+	ph->data[3] = PAD;
+	ph->data[4] = PAD;
+
+	pn_skb_get_src_sockaddr(oskb, &dst);
+	return pn_skb_send(sk, skb, &dst);
+}
+
+static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER + 4, priority);
+	if (!skb)
+		return -ENOMEM;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER + 4);
+	__skb_push(skb, sizeof(*ph) + 4);
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	ph->message_id = PNS_PEP_STATUS_IND;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->pep_type = PN_PEP_TYPE_COMMON;
+	ph->data[1] = type;
+	ph->data[2] = PAD;
+	ph->data[3] = PAD;
+	ph->data[4] = status;
+
+	return pn_skb_send(sk, skb, &pipe_srv);
+}
+
+/* Send our RX flow control information to the sender.
+ * Socket must be locked. */
+static void pipe_grant_credits(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	BUG_ON(sk->sk_state != TCP_ESTABLISHED);
+
+	switch (pn->rx_fc) {
+	case PN_LEGACY_FLOW_CONTROL: /* TODO */
+		break;
+	case PN_ONE_CREDIT_FLOW_CONTROL:
+		pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL,
+				PEP_IND_READY, GFP_ATOMIC);
+		pn->rx_credits = 1;
+		break;
+	case PN_MULTI_CREDIT_FLOW_CONTROL:
+		if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX)
+			break;
+		if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS,
+					CREDITS_MAX - pn->rx_credits,
+					GFP_ATOMIC) == 0)
+			pn->rx_credits = CREDITS_MAX;
+		break;
+	}
+}
+
+static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+
+	if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+		return -EINVAL;
+
+	if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
+		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n",
+				(unsigned)hdr->data[0]);
+		return -EOPNOTSUPP;
+	}
+
+	switch (hdr->data[1]) {
+	case PN_PEP_IND_FLOW_CONTROL:
+		switch (pn->tx_fc) {
+		case PN_LEGACY_FLOW_CONTROL:
+			switch (hdr->data[4]) {
+			case PEP_IND_BUSY:
+				pn->tx_credits = 0;
+				break;
+			case PEP_IND_READY:
+				pn->tx_credits = 1;
+				break;
+			}
+			break;
+		case PN_ONE_CREDIT_FLOW_CONTROL:
+			if (hdr->data[4] == PEP_IND_READY)
+				pn->tx_credits = 1;
+			break;
+		}
+		break;
+
+	case PN_PEP_IND_ID_MCFC_GRANT_CREDITS:
+		if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL)
+			break;
+		if (pn->tx_credits + hdr->data[4] > 0xff)
+			pn->tx_credits = 0xff;
+		else
+			pn->tx_credits += hdr->data[4];
+		break;
+
+	default:
+		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP indication: %u\n",
+				(unsigned)hdr->data[1]);
+		return -EOPNOTSUPP;
+	}
+	if (pn->tx_credits)
+		sk->sk_write_space(sk);
+	return 0;
+}
+
+static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+	u8 n_sb = hdr->data[0];
+
+	pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+	__skb_pull(skb, sizeof(*hdr));
+	while (n_sb > 0) {
+		u8 type, buf[2], len = sizeof(buf);
+		u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+		if (data == NULL)
+			return -EINVAL;
+		switch (type) {
+		case PN_PIPE_SB_NEGOTIATED_FC:
+			if (len < 2 || (data[0] | data[1]) > 3)
+				break;
+			pn->tx_fc = data[0] & 3;
+			pn->rx_fc = data[1] & 3;
+			break;
+		}
+		n_sb--;
+	}
+	return 0;
+}
+
+/* Queue an skb to a connected sock.
+ * Socket lock must be held. */
+static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+	int err = 0;
+
+	BUG_ON(sk->sk_state == TCP_CLOSE_WAIT);
+
+	switch (hdr->message_id) {
+	case PNS_PEP_CONNECT_REQ:
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE);
+		break;
+
+	case PNS_PEP_DISCONNECT_REQ:
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		sk->sk_state = TCP_CLOSE_WAIT;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+		break;
+
+	case PNS_PEP_ENABLE_REQ:
+		/* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_RESET_REQ:
+		switch (hdr->state_after_reset) {
+		case PN_PIPE_DISABLE:
+			pn->init_enable = 0;
+			break;
+		case PN_PIPE_ENABLE:
+			pn->init_enable = 1;
+			break;
+		default: /* not allowed to send an error here!? */
+			err = -EINVAL;
+			goto out;
+		}
+		/* fall through */
+	case PNS_PEP_DISABLE_REQ:
+		pn->tx_credits = 0;
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_CTRL_REQ:
+		/* TODO */
+		pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR);
+		break;
+
+	case PNS_PIPE_DATA:
+		__skb_pull(skb, 3); /* Pipe data header */
+		if (!pn_flow_safe(pn->rx_fc)) {
+			err = sock_queue_rcv_skb(sk, skb);
+			if (!err)
+				return 0;
+			break;
+		}
+
+		if (pn->rx_credits == 0) {
+			err = -ENOBUFS;
+			break;
+		}
+		pn->rx_credits--;
+		skb->dev = NULL;
+		skb_set_owner_r(skb, sk);
+		err = skb->len;
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_data_ready(sk, err);
+		return 0;
+
+	case PNS_PEP_STATUS_IND:
+		pipe_rcv_status(sk, skb);
+		break;
+
+	case PNS_PIPE_REDIRECTED_IND:
+		err = pipe_rcv_created(sk, skb);
+		break;
+
+	case PNS_PIPE_CREATED_IND:
+		err = pipe_rcv_created(sk, skb);
+		if (err)
+			break;
+		/* fall through */
+	case PNS_PIPE_RESET_IND:
+		if (!pn->init_enable)
+			break;
+		/* fall through */
+	case PNS_PIPE_ENABLED_IND:
+		if (!pn_flow_safe(pn->tx_fc)) {
+			pn->tx_credits = 1;
+			sk->sk_write_space(sk);
+		}
+		if (sk->sk_state == TCP_ESTABLISHED)
+			break; /* Nothing to do */
+		sk->sk_state = TCP_ESTABLISHED;
+		pipe_grant_credits(sk);
+		break;
+
+	case PNS_PIPE_DISABLED_IND:
+		sk->sk_state = TCP_SYN_RECV;
+		pn->rx_credits = 0;
+		break;
+
+	default:
+		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP message: %u\n",
+				hdr->message_id);
+		err = -EINVAL;
+	}
+out:
+	kfree_skb(skb);
+	return err;
+}
+
+/* Destroy connected sock. */
+static void pipe_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
+static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct sock *newsk;
+	struct pep_sock *newpn, *pn = pep_sk(sk);
+	struct pnpipehdr *hdr;
+	struct sockaddr_pn dst;
+	u16 peer_type;
+	u8 pipe_handle, enabled, n_sb;
+
+	if (!pskb_pull(skb, sizeof(*hdr) + 4))
+		return -EINVAL;
+
+	hdr = pnp_hdr(skb);
+	pipe_handle = hdr->pipe_handle;
+	switch (hdr->state_after_connect) {
+	case PN_PIPE_DISABLE:
+		enabled = 0;
+		break;
+	case PN_PIPE_ENABLE:
+		enabled = 1;
+		break;
+	default:
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM);
+		return -EINVAL;
+	}
+	peer_type = hdr->other_pep_type << 8;
+
+	if (unlikely(sk->sk_state != TCP_LISTEN) || sk_acceptq_is_full(sk)) {
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE);
+		return -ENOBUFS;
+	}
+
+	/* Parse sub-blocks (options) */
+	n_sb = hdr->data[4];
+	while (n_sb > 0) {
+		u8 type, buf[1], len = sizeof(buf);
+		const u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+		if (data == NULL)
+			return -EINVAL;
+		switch (type) {
+		case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE:
+			if (len < 1)
+				return -EINVAL;
+			peer_type = (peer_type & 0xff00) | data[0];
+			break;
+		}
+		n_sb--;
+	}
+
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	/* Create a new to-be-accepted sock */
+	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_ATOMIC, sk->sk_prot);
+	if (!newsk) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	sock_init_data(NULL, newsk);
+	newsk->sk_state = TCP_SYN_RECV;
+	newsk->sk_backlog_rcv = pipe_do_rcv;
+	newsk->sk_protocol = sk->sk_protocol;
+	newsk->sk_destruct = pipe_destruct;
+
+	newpn = pep_sk(newsk);
+	pn_skb_get_dst_sockaddr(skb, &dst);
+	newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
+	newpn->pn_sk.resource = pn->pn_sk.resource;
+	newpn->pipe_handle = pipe_handle;
+	newpn->peer_type = peer_type;
+	newpn->rx_credits = newpn->tx_credits = 0;
+	newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+	newpn->init_enable = enabled;
+
+	BUG_ON(!skb_queue_empty(&newsk->sk_receive_queue));
+	skb_queue_head(&newsk->sk_receive_queue, skb);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, 0);
+
+	sk_acceptq_added(sk);
+	sk_add_node(newsk, &pn->ackq);
+	return 0;
+}
+
+/* Listening sock must be locked */
+static struct sock *pep_find_pipe(const struct hlist_head *hlist,
+					const struct sockaddr_pn *dst,
+					u8 pipe_handle)
+{
+	struct hlist_node *node;
+	struct sock *sknode;
+	u16 dobj = pn_sockaddr_get_object(dst);
+
+	sk_for_each(sknode, node, hlist) {
+		struct pep_sock *pnnode = pep_sk(sknode);
+
+		/* Ports match, but addresses might not: */
+		if (pnnode->pn_sk.sobject != dobj)
+			continue;
+		if (pnnode->pipe_handle != pipe_handle)
+			continue;
+		if (sknode->sk_state == TCP_CLOSE_WAIT)
+			continue;
+
+		sock_hold(sknode);
+		return sknode;
+	}
+	return NULL;
+}
+
+/*
+ * Deliver an skb to a listening sock.
+ * Socket lock must be held.
+ * We then queue the skb to the right connected sock (if any).
+ */
+static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sock *sknode;
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+	struct sockaddr_pn dst;
+	int err = NET_RX_SUCCESS;
+	u8 pipe_handle;
+
+	if (!pskb_may_pull(skb, sizeof(*hdr)))
+		goto drop;
+
+	hdr = pnp_hdr(skb);
+	pipe_handle = hdr->pipe_handle;
+	if (pipe_handle == PN_PIPE_INVALID_HANDLE)
+		goto drop;
+
+	pn_skb_get_dst_sockaddr(skb, &dst);
+
+	/* Look for an existing pipe handle */
+	sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
+	if (sknode)
+		return sk_receive_skb(sknode, skb, 1);
+
+	/* Look for a pipe handle pending accept */
+	sknode = pep_find_pipe(&pn->ackq, &dst, pipe_handle);
+	if (sknode) {
+		sock_put(sknode);
+		if (net_ratelimit())
+			printk(KERN_WARNING"Phonet unconnected PEP ignored");
+		err = NET_RX_DROP;
+		goto drop;
+	}
+
+	switch (hdr->message_id) {
+	case PNS_PEP_CONNECT_REQ:
+		err = pep_connreq_rcv(sk, skb);
+		break;
+
+	case PNS_PEP_DISCONNECT_REQ:
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_CTRL_REQ:
+		pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE);
+		break;
+
+	case PNS_PEP_RESET_REQ:
+	case PNS_PEP_ENABLE_REQ:
+	case PNS_PEP_DISABLE_REQ:
+		/* invalid handle is not even allowed here! */
+	default:
+		err = NET_RX_DROP;
+	}
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+/* associated socket ceases to exist */
+static void pep_sock_close(struct sock *sk, long timeout)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	sk_common_release(sk);
+
+	lock_sock(sk);
+	if (sk->sk_state == TCP_LISTEN) {
+		/* Destroy the listen queue */
+		struct sock *sknode;
+		struct hlist_node *p, *n;
+
+		sk_for_each_safe(sknode, p, n, &pn->ackq)
+			sk_del_node_init(sknode);
+		sk->sk_state = TCP_CLOSE;
+	}
+	release_sock(sk);
+}
+
+static int pep_wait_connreq(struct sock *sk, int noblock)
+{
+	struct task_struct *tsk = current;
+	struct pep_sock *pn = pep_sk(sk);
+	long timeo = sock_rcvtimeo(sk, noblock);
+
+	for (;;) {
+		DEFINE_WAIT(wait);
+
+		if (sk->sk_state != TCP_LISTEN)
+			return -EINVAL;
+		if (!hlist_empty(&pn->ackq))
+			break;
+		if (!timeo)
+			return -EWOULDBLOCK;
+		if (signal_pending(tsk))
+			return sock_intr_errno(timeo);
+
+		prepare_to_wait_exclusive(&sk->sk_socket->wait, &wait,
+						TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		finish_wait(&sk->sk_socket->wait, &wait);
+	}
+
+	return 0;
+}
+
+static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sock *newsk = NULL;
+	struct sk_buff *oskb;
+	int err;
+
+	lock_sock(sk);
+	err = pep_wait_connreq(sk, flags & O_NONBLOCK);
+	if (err)
+		goto out;
+
+	newsk = __sk_head(&pn->ackq);
+
+	oskb = skb_dequeue(&newsk->sk_receive_queue);
+	err = pep_accept_conn(newsk, oskb);
+	if (err) {
+		skb_queue_head(&newsk->sk_receive_queue, oskb);
+		newsk = NULL;
+		goto out;
+	}
+
+	sock_hold(sk);
+	pep_sk(newsk)->listener = sk;
+
+	sock_hold(newsk);
+	sk_del_node_init(newsk);
+	sk_acceptq_removed(sk);
+	sk_add_node(newsk, &pn->hlist);
+	__sock_put(newsk);
+
+out:
+	release_sock(sk);
+	*errp = err;
+	return newsk;
+}
+
+static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	int answ;
+
+	switch (cmd) {
+	case SIOCINQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		lock_sock(sk);
+		if (!skb_queue_empty(&sk->sk_receive_queue))
+			answ = skb_peek(&sk->sk_receive_queue)->len;
+		else
+			answ = 0;
+		release_sock(sk);
+		return put_user(answ, (int __user *)arg);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static int pep_init(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	INIT_HLIST_HEAD(&pn->ackq);
+	INIT_HLIST_HEAD(&pn->hlist);
+	pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+	return 0;
+}
+
+static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sk_buff *skb = NULL;
+	struct pnpipehdr *ph;
+	long timeo;
+	int flags = msg->msg_flags;
+	int err, done;
+
+	if (msg->msg_flags & MSG_OOB || !(msg->msg_flags & MSG_EOR))
+		return -EOPNOTSUPP;
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) {
+		err = -ENOTCONN;
+		goto out;
+	}
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		/* Wait until the pipe gets to enabled state */
+disabled:
+		err = sk_stream_wait_connect(sk, &timeo);
+		if (err)
+			goto out;
+
+		if (sk->sk_state == TCP_CLOSE_WAIT) {
+			err = -ECONNRESET;
+			goto out;
+		}
+	}
+	BUG_ON(sk->sk_state != TCP_ESTABLISHED);
+
+	/* Wait until flow control allows TX */
+	done = pn->tx_credits > 0;
+	while (!done) {
+		DEFINE_WAIT(wait);
+
+		if (!timeo) {
+			err = -EAGAIN;
+			goto out;
+		}
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			goto out;
+		}
+
+		prepare_to_wait(&sk->sk_socket->wait, &wait,
+				TASK_INTERRUPTIBLE);
+		done = sk_wait_event(sk, &timeo, pn->tx_credits > 0);
+		finish_wait(&sk->sk_socket->wait, &wait);
+
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto disabled;
+	}
+
+	if (!skb) {
+		skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
+						flags & MSG_DONTWAIT, &err);
+		if (skb == NULL)
+			goto out;
+		skb_reserve(skb, MAX_PHONET_HEADER + 3);
+
+		if (sk->sk_state != TCP_ESTABLISHED || !pn->tx_credits)
+			goto disabled; /* sock_alloc_send_skb might sleep */
+	}
+
+	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (err < 0)
+		goto out;
+
+	__skb_push(skb, 3);
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	ph->message_id = PNS_PIPE_DATA;
+	ph->pipe_handle = pn->pipe_handle;
+	if (pn_flow_safe(pn->tx_fc)) /* credit-based flow control */
+		pn->tx_credits--;
+
+	err = pn_skb_send(sk, skb, &pipe_srv);
+	if (err >= 0)
+		err = len; /* success! */
+	skb = NULL;
+out:
+	release_sock(sk);
+	kfree_skb(skb);
+	return err;
+}
+
+static int pep_recvmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len, int noblock,
+			int flags, int *addr_len)
+{
+	struct sk_buff *skb;
+	int err;
+
+	if (unlikely(flags & MSG_OOB))
+		return -EOPNOTSUPP;
+	if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE)))
+		return -ENOTCONN;
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	lock_sock(sk);
+	if (skb == NULL) {
+		if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT)
+			err = -ECONNRESET;
+		release_sock(sk);
+		return err;
+	}
+
+	if (sk->sk_state == TCP_ESTABLISHED)
+		pipe_grant_credits(sk);
+	release_sock(sk);
+
+	msg->msg_flags |= MSG_EOR;
+
+	if (skb->len > len)
+		msg->msg_flags |= MSG_TRUNC;
+	else
+		len = skb->len;
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	if (!err)
+		err = (flags & MSG_TRUNC) ? skb->len : len;
+
+	skb_free_datagram(sk, skb);
+	return err;
+}
+
+static void pep_sock_unhash(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sock *skparent = NULL;
+
+	lock_sock(sk);
+	if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) {
+		skparent = pn->listener;
+		sk_del_node_init(sk);
+		release_sock(sk);
+
+		sk = skparent;
+		pn = pep_sk(skparent);
+		lock_sock(sk);
+	}
+	/* Unhash a listening sock only when it is closed
+	 * and all of its active connected pipes are closed. */
+	if (hlist_empty(&pn->hlist))
+		pn_sock_unhash(&pn->pn_sk.sk);
+	release_sock(sk);
+
+	if (skparent)
+		sock_put(skparent);
+}
+
+static struct proto pep_proto = {
+	.close		= pep_sock_close,
+	.accept		= pep_sock_accept,
+	.ioctl		= pep_ioctl,
+	.init		= pep_init,
+	.sendmsg	= pep_sendmsg,
+	.recvmsg	= pep_recvmsg,
+	.backlog_rcv	= pep_do_rcv,
+	.hash		= pn_sock_hash,
+	.unhash		= pep_sock_unhash,
+	.get_port	= pn_sock_get_port,
+	.obj_size	= sizeof(struct pep_sock),
+	.owner		= THIS_MODULE,
+	.name		= "PNPIPE",
+};
+
+static struct phonet_protocol pep_pn_proto = {
+	.ops		= &phonet_stream_ops,
+	.prot		= &pep_proto,
+	.sock_type	= SOCK_SEQPACKET,
+};
+
+static int __init pep_register(void)
+{
+	return phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto);
+}
+
+static void __exit pep_unregister(void)
+{
+	phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto);
+}
+
+module_init(pep_register);
+module_exit(pep_unregister);
+MODULE_AUTHOR("Remi Denis-Courmont, Nokia");
+MODULE_DESCRIPTION("Phonet pipe protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE);
-- 
cgit v1.2.3


From c41bd97f815720f9404f97da0c4f4400b52c243d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Sun, 5 Oct 2008 11:15:43 -0700
Subject: Phonet: receive pipe control requests as out-of-band data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/pep.c    | 65 ++++++++++++++++++++++++++++++++++++++---------------
 net/phonet/socket.c |  4 +++-
 2 files changed, 50 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index c5dfecb207d..d564d07cb79 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -137,14 +137,15 @@ static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code)
 
 /* Control requests are not sent by the pipe service and have a specific
  * message format. */
-static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code)
+static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code,
+				gfp_t priority)
 {
 	const struct pnpipehdr *oph = pnp_hdr(oskb);
 	struct sk_buff *skb;
 	struct pnpipehdr *ph;
 	struct sockaddr_pn dst;
 
-	skb = alloc_skb(MAX_PNPIPE_HEADER + 4, GFP_ATOMIC);
+	skb = alloc_skb(MAX_PNPIPE_HEADER + 4, priority);
 	if (!skb)
 		return -ENOMEM;
 	skb_set_owner_w(skb, sk);
@@ -305,6 +306,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct pep_sock *pn = pep_sk(sk);
 	struct pnpipehdr *hdr = pnp_hdr(skb);
+	struct sk_buff_head *queue;
 	int err = 0;
 
 	BUG_ON(sk->sk_state == TCP_CLOSE_WAIT);
@@ -345,9 +347,11 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case PNS_PEP_CTRL_REQ:
-		/* TODO */
-		pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR);
-		break;
+		if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX)
+			break;
+		__skb_pull(skb, 4);
+		queue = &pn->ctrlreq_queue;
+		goto queue;
 
 	case PNS_PIPE_DATA:
 		__skb_pull(skb, 3); /* Pipe data header */
@@ -363,13 +367,8 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
 			break;
 		}
 		pn->rx_credits--;
-		skb->dev = NULL;
-		skb_set_owner_r(skb, sk);
-		err = skb->len;
-		skb_queue_tail(&sk->sk_receive_queue, skb);
-		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_data_ready(sk, err);
-		return 0;
+		queue = &sk->sk_receive_queue;
+		goto queue;
 
 	case PNS_PEP_STATUS_IND:
 		pipe_rcv_status(sk, skb);
@@ -412,12 +411,24 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
 out:
 	kfree_skb(skb);
 	return err;
+
+queue:
+	skb->dev = NULL;
+	skb_set_owner_r(skb, sk);
+	err = skb->len;
+	skb_queue_tail(queue, skb);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, err);
+	return 0;
 }
 
 /* Destroy connected sock. */
 static void pipe_destruct(struct sock *sk)
 {
+	struct pep_sock *pn = pep_sk(sk);
+
 	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&pn->ctrlreq_queue);
 }
 
 static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
@@ -490,6 +501,7 @@ static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
 	pn_skb_get_dst_sockaddr(skb, &dst);
 	newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
 	newpn->pn_sk.resource = pn->pn_sk.resource;
+	skb_queue_head_init(&newpn->ctrlreq_queue);
 	newpn->pipe_handle = pipe_handle;
 	newpn->peer_type = peer_type;
 	newpn->rx_credits = newpn->tx_credits = 0;
@@ -581,7 +593,7 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case PNS_PEP_CTRL_REQ:
-		pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE);
+		pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC);
 		break;
 
 	case PNS_PEP_RESET_REQ:
@@ -684,6 +696,7 @@ out:
 
 static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
+	struct pep_sock *pn = pep_sk(sk);
 	int answ;
 
 	switch (cmd) {
@@ -692,7 +705,10 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			return -EINVAL;
 
 		lock_sock(sk);
-		if (!skb_queue_empty(&sk->sk_receive_queue))
+		if (sock_flag(sk, SOCK_URGINLINE)
+		 && !skb_queue_empty(&pn->ctrlreq_queue))
+			answ = skb_peek(&pn->ctrlreq_queue)->len;
+		else if (!skb_queue_empty(&sk->sk_receive_queue))
 			answ = skb_peek(&sk->sk_receive_queue)->len;
 		else
 			answ = 0;
@@ -709,6 +725,7 @@ static int pep_init(struct sock *sk)
 
 	INIT_HLIST_HEAD(&pn->ackq);
 	INIT_HLIST_HEAD(&pn->hlist);
+	skb_queue_head_init(&pn->ctrlreq_queue);
 	pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
 	return 0;
 }
@@ -810,11 +827,24 @@ static int pep_recvmsg(struct kiocb *iocb, struct sock *sk,
 	struct sk_buff *skb;
 	int err;
 
-	if (unlikely(flags & MSG_OOB))
-		return -EOPNOTSUPP;
 	if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE)))
 		return -ENOTCONN;
 
+	if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) {
+		/* Dequeue and acknowledge control request */
+		struct pep_sock *pn = pep_sk(sk);
+
+		skb = skb_dequeue(&pn->ctrlreq_queue);
+		if (skb) {
+			pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR,
+						GFP_KERNEL);
+			msg->msg_flags |= MSG_OOB;
+			goto copy;
+		}
+		if (flags & MSG_OOB)
+			return -EINVAL;
+	}
+
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	lock_sock(sk);
 	if (skb == NULL) {
@@ -827,9 +857,8 @@ static int pep_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (sk->sk_state == TCP_ESTABLISHED)
 		pipe_grant_credits(sk);
 	release_sock(sk);
-
+copy:
 	msg->msg_flags |= MSG_EOR;
-
 	if (skb->len > len)
 		msg->msg_flags |= MSG_TRUNC;
 	else
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index cea1136cbe4..a9c3d1f2e9d 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -220,7 +220,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
-	else if (sk->sk_state == TCP_CLOSE_WAIT)
+	if (!skb_queue_empty(&pn->ctrlreq_queue))
+		mask |= POLLPRI;
+	if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
 		return POLLHUP;
 
 	if (sk->sk_state == TCP_ESTABLISHED && pn->tx_credits)
-- 
cgit v1.2.3


From 02a47617cdce440f60c71a51f3a93f9f5fcc5a7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi.denis-courmont@nokia.com>
Date: Sun, 5 Oct 2008 11:16:16 -0700
Subject: Phonet: implement GPRS virtual interface over PEP socket
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/Makefile   |   2 +-
 net/phonet/pep-gprs.c | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/phonet/pep.c      | 161 +++++++++++++++++++++--
 net/phonet/socket.c   |   8 +-
 4 files changed, 502 insertions(+), 16 deletions(-)
 create mode 100644 net/phonet/pep-gprs.c

(limited to 'net')

diff --git a/net/phonet/Makefile b/net/phonet/Makefile
index 505df2a04a8..d62bbba649b 100644
--- a/net/phonet/Makefile
+++ b/net/phonet/Makefile
@@ -8,4 +8,4 @@ phonet-objs := \
 	sysctl.o \
 	af_phonet.o
 
-pn_pep-objs := pep.o
+pn_pep-objs := pep.o pep-gprs.o
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
new file mode 100644
index 00000000000..9978afbd9f2
--- /dev/null
+++ b/net/phonet/pep-gprs.c
@@ -0,0 +1,347 @@
+/*
+ * File: pep-gprs.c
+ *
+ * GPRS over Phonet pipe end point socket
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <net/sock.h>
+
+#include <linux/if_phonet.h>
+#include <net/tcp_states.h>
+#include <net/phonet/gprs.h>
+
+#define GPRS_DEFAULT_MTU 1400
+
+struct gprs_dev {
+	struct sock		*sk;
+	void			(*old_state_change)(struct sock *);
+	void			(*old_data_ready)(struct sock *, int);
+	void			(*old_write_space)(struct sock *);
+
+	struct net_device	*net;
+	struct net_device_stats	stats;
+
+	struct sk_buff_head	tx_queue;
+	struct work_struct	tx_work;
+	spinlock_t		tx_lock;
+	unsigned		tx_max;
+};
+
+static int gprs_type_trans(struct sk_buff *skb)
+{
+	const u8 *pvfc;
+	u8 buf;
+
+	pvfc = skb_header_pointer(skb, 0, 1, &buf);
+	if (!pvfc)
+		return 0;
+	/* Look at IP version field */
+	switch (*pvfc >> 4) {
+	case 4:
+		return htons(ETH_P_IP);
+	case 6:
+		return htons(ETH_P_IPV6);
+	}
+	return 0;
+}
+
+/*
+ * Socket callbacks
+ */
+
+static void gprs_state_change(struct sock *sk)
+{
+	struct gprs_dev *dev = sk->sk_user_data;
+
+	if (sk->sk_state == TCP_CLOSE_WAIT) {
+		netif_stop_queue(dev->net);
+		netif_carrier_off(dev->net);
+	}
+}
+
+static int gprs_recv(struct gprs_dev *dev, struct sk_buff *skb)
+{
+	int err = 0;
+	u16 protocol = gprs_type_trans(skb);
+
+	if (!protocol) {
+		err = -EINVAL;
+		goto drop;
+	}
+
+	if (likely(skb_headroom(skb) & 3)) {
+		struct sk_buff *rskb, *fs;
+		int flen = 0;
+
+		/* Phonet Pipe data header is misaligned (3 bytes),
+		 * so wrap the IP packet as a single fragment of an head-less
+		 * socket buffer. The network stack will pull what it needs,
+		 * but at least, the whole IP payload is not memcpy'd. */
+		rskb = netdev_alloc_skb(dev->net, 0);
+		if (!rskb) {
+			err = -ENOBUFS;
+			goto drop;
+		}
+		skb_shinfo(rskb)->frag_list = skb;
+		rskb->len += skb->len;
+		rskb->data_len += rskb->len;
+		rskb->truesize += rskb->len;
+
+		/* Avoid nested fragments */
+		for (fs = skb_shinfo(skb)->frag_list; fs; fs = fs->next)
+			flen += fs->len;
+		skb->next = skb_shinfo(skb)->frag_list;
+		skb_shinfo(skb)->frag_list = NULL;
+		skb->len -= flen;
+		skb->data_len -= flen;
+		skb->truesize -= flen;
+
+		skb = rskb;
+	}
+
+	skb->protocol = protocol;
+	skb_reset_mac_header(skb);
+	skb->dev = dev->net;
+
+	if (likely(dev->net->flags & IFF_UP)) {
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += skb->len;
+		netif_rx(skb);
+		skb = NULL;
+	} else
+		err = -ENODEV;
+
+drop:
+	if (skb) {
+		dev_kfree_skb(skb);
+		dev->stats.rx_dropped++;
+	}
+	return err;
+}
+
+static void gprs_data_ready(struct sock *sk, int len)
+{
+	struct gprs_dev *dev = sk->sk_user_data;
+	struct sk_buff *skb;
+
+	while ((skb = pep_read(sk)) != NULL) {
+		skb_orphan(skb);
+		gprs_recv(dev, skb);
+	}
+}
+
+static void gprs_write_space(struct sock *sk)
+{
+	struct gprs_dev *dev = sk->sk_user_data;
+	unsigned credits = pep_writeable(sk);
+
+	spin_lock_bh(&dev->tx_lock);
+	dev->tx_max = credits;
+	if (credits > skb_queue_len(&dev->tx_queue))
+		netif_wake_queue(dev->net);
+	spin_unlock_bh(&dev->tx_lock);
+}
+
+/*
+ * Network device callbacks
+ */
+
+static int gprs_xmit(struct sk_buff *skb, struct net_device *net)
+{
+	struct gprs_dev *dev = netdev_priv(net);
+
+	switch (skb->protocol) {
+	case  htons(ETH_P_IP):
+	case  htons(ETH_P_IPV6):
+		break;
+	default:
+		dev_kfree_skb(skb);
+		return 0;
+	}
+
+	spin_lock(&dev->tx_lock);
+	if (likely(skb_queue_len(&dev->tx_queue) < dev->tx_max)) {
+		skb_queue_tail(&dev->tx_queue, skb);
+		skb = NULL;
+	}
+	if (skb_queue_len(&dev->tx_queue) >= dev->tx_max)
+		netif_stop_queue(net);
+	spin_unlock(&dev->tx_lock);
+
+	schedule_work(&dev->tx_work);
+	if (unlikely(skb))
+		dev_kfree_skb(skb);
+	return 0;
+}
+
+static void gprs_tx(struct work_struct *work)
+{
+	struct gprs_dev *dev = container_of(work, struct gprs_dev, tx_work);
+	struct sock *sk = dev->sk;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&dev->tx_queue)) != NULL) {
+		int err;
+
+		dev->stats.tx_bytes += skb->len;
+		dev->stats.tx_packets++;
+
+		skb_orphan(skb);
+		skb_set_owner_w(skb, sk);
+
+		lock_sock(sk);
+		err = pep_write(sk, skb);
+		if (err) {
+			LIMIT_NETDEBUG(KERN_WARNING"%s: TX error (%d)\n",
+					dev->net->name, err);
+			dev->stats.tx_aborted_errors++;
+			dev->stats.tx_errors++;
+		}
+		release_sock(sk);
+	}
+
+	lock_sock(sk);
+	gprs_write_space(sk);
+	release_sock(sk);
+}
+
+static int gprs_set_mtu(struct net_device *net, int new_mtu)
+{
+	if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11)))
+		return -EINVAL;
+
+	net->mtu = new_mtu;
+	return 0;
+}
+
+static struct net_device_stats *gprs_get_stats(struct net_device *net)
+{
+	struct gprs_dev *dev = netdev_priv(net);
+
+	return &dev->stats;
+}
+
+static void gprs_setup(struct net_device *net)
+{
+	net->features		= NETIF_F_FRAGLIST;
+	net->type		= ARPHRD_NONE;
+	net->flags		= IFF_POINTOPOINT | IFF_NOARP;
+	net->mtu		= GPRS_DEFAULT_MTU;
+	net->hard_header_len	= 0;
+	net->addr_len		= 0;
+	net->tx_queue_len	= 10;
+
+	net->destructor		= free_netdev;
+	net->hard_start_xmit	= gprs_xmit; /* mandatory */
+	net->change_mtu		= gprs_set_mtu;
+	net->get_stats		= gprs_get_stats;
+}
+
+/*
+ * External interface
+ */
+
+/*
+ * Attach a GPRS interface to a datagram socket.
+ * Returns the interface index on success, negative error code on error.
+ */
+int gprs_attach(struct sock *sk)
+{
+	static const char ifname[] = "gprs%d";
+	struct gprs_dev *dev;
+	struct net_device *net;
+	int err;
+
+	if (unlikely(sk->sk_type == SOCK_STREAM))
+		return -EINVAL; /* need packet boundaries */
+
+	/* Create net device */
+	net = alloc_netdev(sizeof(*dev), ifname, gprs_setup);
+	if (!net)
+		return -ENOMEM;
+	dev = netdev_priv(net);
+	dev->net = net;
+	dev->tx_max = 0;
+	spin_lock_init(&dev->tx_lock);
+	skb_queue_head_init(&dev->tx_queue);
+	INIT_WORK(&dev->tx_work, gprs_tx);
+
+	netif_stop_queue(net);
+	err = register_netdev(net);
+	if (err) {
+		free_netdev(net);
+		return err;
+	}
+
+	lock_sock(sk);
+	if (unlikely(sk->sk_user_data)) {
+		err = -EBUSY;
+		goto out_rel;
+	}
+	if (unlikely((1 << sk->sk_state & (TCPF_CLOSE|TCPF_LISTEN)) ||
+			sock_flag(sk, SOCK_DEAD))) {
+		err = -EINVAL;
+		goto out_rel;
+	}
+	sk->sk_user_data	= dev;
+	dev->old_state_change	= sk->sk_state_change;
+	dev->old_data_ready	= sk->sk_data_ready;
+	dev->old_write_space	= sk->sk_write_space;
+	sk->sk_state_change	= gprs_state_change;
+	sk->sk_data_ready	= gprs_data_ready;
+	sk->sk_write_space	= gprs_write_space;
+	release_sock(sk);
+
+	sock_hold(sk);
+	dev->sk = sk;
+
+	printk(KERN_DEBUG"%s: attached\n", net->name);
+	gprs_write_space(sk); /* kick off TX */
+	return net->ifindex;
+
+out_rel:
+	release_sock(sk);
+	unregister_netdev(net);
+	return err;
+}
+
+void gprs_detach(struct sock *sk)
+{
+	struct gprs_dev *dev = sk->sk_user_data;
+	struct net_device *net = dev->net;
+
+	lock_sock(sk);
+	sk->sk_user_data	= NULL;
+	sk->sk_state_change	= dev->old_state_change;
+	sk->sk_data_ready	= dev->old_data_ready;
+	sk->sk_write_space	= dev->old_write_space;
+	release_sock(sk);
+
+	printk(KERN_DEBUG"%s: detached\n", net->name);
+	unregister_netdev(net);
+	flush_scheduled_work();
+	sock_put(sk);
+	skb_queue_purge(&dev->tx_queue);
+}
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index d564d07cb79..bc6d50f8324 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -31,6 +31,7 @@
 #include <linux/phonet.h>
 #include <net/phonet/phonet.h>
 #include <net/phonet/pep.h>
+#include <net/phonet/gprs.h>
 
 /* sk_state values:
  * TCP_CLOSE		sock not in use yet
@@ -612,6 +613,7 @@ drop:
 static void pep_sock_close(struct sock *sk, long timeout)
 {
 	struct pep_sock *pn = pep_sk(sk);
+	int ifindex = 0;
 
 	sk_common_release(sk);
 
@@ -625,7 +627,12 @@ static void pep_sock_close(struct sock *sk, long timeout)
 			sk_del_node_init(sknode);
 		sk->sk_state = TCP_CLOSE;
 	}
+	ifindex = pn->ifindex;
+	pn->ifindex = 0;
 	release_sock(sk);
+
+	if (ifindex)
+		gprs_detach(sk);
 }
 
 static int pep_wait_connreq(struct sock *sk, int noblock)
@@ -730,12 +737,107 @@ static int pep_init(struct sock *sk)
 	return 0;
 }
 
+static int pep_setsockopt(struct sock *sk, int level, int optname,
+				char __user *optval, int optlen)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int val = 0, err = 0;
+
+	if (level != SOL_PNPIPE)
+		return -ENOPROTOOPT;
+	if (optlen >= sizeof(int)) {
+		if (get_user(val, (int __user *) optval))
+			return -EFAULT;
+	}
+
+	lock_sock(sk);
+	switch (optname) {
+	case PNPIPE_ENCAP:
+		if (val && val != PNPIPE_ENCAP_IP) {
+			err = -EINVAL;
+			break;
+		}
+		if (!pn->ifindex == !val)
+			break; /* Nothing to do! */
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		if (val) {
+			release_sock(sk);
+			err = gprs_attach(sk);
+			if (err > 0) {
+				pn->ifindex = err;
+				err = 0;
+			}
+		} else {
+			pn->ifindex = 0;
+			release_sock(sk);
+			gprs_detach(sk);
+			err = 0;
+		}
+		goto out_norel;
+	default:
+		err = -ENOPROTOOPT;
+	}
+	release_sock(sk);
+
+out_norel:
+	return err;
+}
+
+static int pep_getsockopt(struct sock *sk, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int len, val;
+
+	if (level != SOL_PNPIPE)
+		return -ENOPROTOOPT;
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	switch (optname) {
+	case PNPIPE_ENCAP:
+		val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
+		break;
+	case PNPIPE_IFINDEX:
+		val = pn->ifindex;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	len = min_t(unsigned int, sizeof(int), len);
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (put_user(val, (int __user *) optval))
+		return -EFAULT;
+	return 0;
+}
+
+static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+
+	skb_push(skb, 3);
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	ph->message_id = PNS_PIPE_DATA;
+	ph->pipe_handle = pn->pipe_handle;
+	if (pn_flow_safe(pn->tx_fc) && pn->tx_credits)
+		pn->tx_credits--;
+
+	return pn_skb_send(sk, skb, &pipe_srv);
+}
+
 static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
 			struct msghdr *msg, size_t len)
 {
 	struct pep_sock *pn = pep_sk(sk);
 	struct sk_buff *skb = NULL;
-	struct pnpipehdr *ph;
 	long timeo;
 	int flags = msg->msg_flags;
 	int err, done;
@@ -801,16 +903,7 @@ disabled:
 	if (err < 0)
 		goto out;
 
-	__skb_push(skb, 3);
-	skb_reset_transport_header(skb);
-	ph = pnp_hdr(skb);
-	ph->utid = 0;
-	ph->message_id = PNS_PIPE_DATA;
-	ph->pipe_handle = pn->pipe_handle;
-	if (pn_flow_safe(pn->tx_fc)) /* credit-based flow control */
-		pn->tx_credits--;
-
-	err = pn_skb_send(sk, skb, &pipe_srv);
+	err = pipe_skb_send(sk, skb);
 	if (err >= 0)
 		err = len; /* success! */
 	skb = NULL;
@@ -820,6 +913,50 @@ out:
 	return err;
 }
 
+int pep_writeable(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	return (sk->sk_state == TCP_ESTABLISHED) ? pn->tx_credits : 0;
+}
+
+int pep_write(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *rskb, *fs;
+	int flen = 0;
+
+	rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
+	if (!rskb) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	skb_shinfo(rskb)->frag_list = skb;
+	rskb->len += skb->len;
+	rskb->data_len += rskb->len;
+	rskb->truesize += rskb->len;
+
+	/* Avoid nested fragments */
+	for (fs = skb_shinfo(skb)->frag_list; fs; fs = fs->next)
+		flen += fs->len;
+	skb->next = skb_shinfo(skb)->frag_list;
+	skb_shinfo(skb)->frag_list = NULL;
+	skb->len -= flen;
+	skb->data_len -= flen;
+	skb->truesize -= flen;
+
+	skb_reserve(rskb, MAX_PHONET_HEADER + 3);
+	return pipe_skb_send(sk, rskb);
+}
+
+struct sk_buff *pep_read(struct sock *sk)
+{
+	struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue);
+
+	if (sk->sk_state == TCP_ESTABLISHED)
+		pipe_grant_credits(sk);
+	return skb;
+}
+
 static int pep_recvmsg(struct kiocb *iocb, struct sock *sk,
 			struct msghdr *msg, size_t len, int noblock,
 			int flags, int *addr_len)
@@ -902,6 +1039,8 @@ static struct proto pep_proto = {
 	.accept		= pep_sock_accept,
 	.ioctl		= pep_ioctl,
 	.init		= pep_init,
+	.setsockopt	= pep_setsockopt,
+	.getsockopt	= pep_getsockopt,
 	.sendmsg	= pep_sendmsg,
 	.recvmsg	= pep_recvmsg,
 	.backlog_rcv	= pep_do_rcv,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index a9c3d1f2e9d..d81740187fb 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -342,11 +342,11 @@ const struct proto_ops phonet_stream_ops = {
 	.ioctl		= pn_socket_ioctl,
 	.listen		= pn_socket_listen,
 	.shutdown	= sock_no_shutdown,
-	.setsockopt	= sock_no_setsockopt,
-	.getsockopt	= sock_no_getsockopt,
+	.setsockopt	= sock_common_setsockopt,
+	.getsockopt	= sock_common_getsockopt,
 #ifdef CONFIG_COMPAT
-	.compat_setsockopt = sock_no_setsockopt,
-	.compat_getsockopt = compat_sock_no_getsockopt,
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
 #endif
 	.sendmsg	= pn_socket_sendmsg,
 	.recvmsg	= sock_common_recvmsg,
-- 
cgit v1.2.3


From 13c1d18931ebb5cf407cb348ef2cd6284d68902d Mon Sep 17 00:00:00 2001
From: Arnaud Ebalard <arno@natisbad.org>
Date: Sun, 5 Oct 2008 13:33:42 -0700
Subject: xfrm: MIGRATE enhancements
 (draft-ebalard-mext-pfkey-enhanced-migrate)

Provides implementation of the enhancements of XFRM/PF_KEY MIGRATE mechanism
specified in draft-ebalard-mext-pfkey-enhanced-migrate-00. Defines associated
PF_KEY SADB_X_EXT_KMADDRESS extension and XFRM/netlink XFRMA_KMADDRESS
attribute.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c       | 86 ++++++++++++++++++++++++++++++++++++++++----------
 net/xfrm/xfrm_policy.c |  5 +--
 net/xfrm/xfrm_state.c  |  5 +--
 net/xfrm/xfrm_user.c   | 57 ++++++++++++++++++++++++++-------
 4 files changed, 120 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 7ae641df70b..362fe317e1f 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -398,6 +398,7 @@ static u8 sadb_ext_min_len[] = {
 	[SADB_X_EXT_NAT_T_DPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
 	[SADB_X_EXT_NAT_T_OA]		= (u8) sizeof(struct sadb_address),
 	[SADB_X_EXT_SEC_CTX]		= (u8) sizeof(struct sadb_x_sec_ctx),
+	[SADB_X_EXT_KMADDRESS]		= (u8) sizeof(struct sadb_x_kmaddress),
 };
 
 /* Verify sadb_address_{len,prefixlen} against sa_family.  */
@@ -2384,24 +2385,21 @@ static int pfkey_sockaddr_pair_size(sa_family_t family)
 	return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2);
 }
 
-static int parse_sockaddr_pair(struct sadb_x_ipsecrequest *rq,
+static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
 			       xfrm_address_t *saddr, xfrm_address_t *daddr,
 			       u16 *family)
 {
-	u8 *sa = (u8 *) (rq + 1);
 	int af, socklen;
 
-	if (rq->sadb_x_ipsecrequest_len <
-	    pfkey_sockaddr_pair_size(((struct sockaddr *)sa)->sa_family))
+	if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
 		return -EINVAL;
 
-	af = pfkey_sockaddr_extract((struct sockaddr *) sa,
-				    saddr);
+	af = pfkey_sockaddr_extract(sa, saddr);
 	if (!af)
 		return -EINVAL;
 
 	socklen = pfkey_sockaddr_len(af);
-	if (pfkey_sockaddr_extract((struct sockaddr *) (sa + socklen),
+	if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen),
 				   daddr) != af)
 		return -EINVAL;
 
@@ -2421,7 +2419,9 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
 		return -EINVAL;
 
 	/* old endoints */
-	err = parse_sockaddr_pair(rq1, &m->old_saddr, &m->old_daddr,
+	err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
+				  rq1->sadb_x_ipsecrequest_len,
+				  &m->old_saddr, &m->old_daddr,
 				  &m->old_family);
 	if (err)
 		return err;
@@ -2434,7 +2434,9 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
 		return -EINVAL;
 
 	/* new endpoints */
-	err = parse_sockaddr_pair(rq2, &m->new_saddr, &m->new_daddr,
+	err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
+				  rq2->sadb_x_ipsecrequest_len,
+				  &m->new_saddr, &m->new_daddr,
 				  &m->new_family);
 	if (err)
 		return err;
@@ -2460,29 +2462,40 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
 	int i, len, ret, err = -EINVAL;
 	u8 dir;
 	struct sadb_address *sa;
+	struct sadb_x_kmaddress *kma;
 	struct sadb_x_policy *pol;
 	struct sadb_x_ipsecrequest *rq;
 	struct xfrm_selector sel;
 	struct xfrm_migrate m[XFRM_MAX_DEPTH];
+	struct xfrm_kmaddress k;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1],
-	    ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) ||
+				     ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) ||
 	    !ext_hdrs[SADB_X_EXT_POLICY - 1]) {
 		err = -EINVAL;
 		goto out;
 	}
 
+	kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1];
 	pol = ext_hdrs[SADB_X_EXT_POLICY - 1];
-	if (!pol) {
-		err = -EINVAL;
-		goto out;
-	}
 
 	if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) {
 		err = -EINVAL;
 		goto out;
 	}
 
+	if (kma) {
+		/* convert sadb_x_kmaddress to xfrm_kmaddress */
+		k.reserved = kma->sadb_x_kmaddress_reserved;
+		ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1),
+					  8*(kma->sadb_x_kmaddress_len) - sizeof(*kma),
+					  &k.local, &k.remote, &k.family);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+	}
+
 	dir = pol->sadb_x_policy_dir - 1;
 	memset(&sel, 0, sizeof(sel));
 
@@ -2527,7 +2540,8 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
 		goto out;
 	}
 
-	return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i);
+	return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i,
+			    kma ? &k : NULL);
 
  out:
 	return err;
@@ -3319,6 +3333,32 @@ static int set_sadb_address(struct sk_buff *skb, int sasize, int type,
 	return 0;
 }
 
+
+static int set_sadb_kmaddress(struct sk_buff *skb, struct xfrm_kmaddress *k)
+{
+	struct sadb_x_kmaddress *kma;
+	u8 *sa;
+	int family = k->family;
+	int socklen = pfkey_sockaddr_len(family);
+	int size_req;
+
+	size_req = (sizeof(struct sadb_x_kmaddress) +
+		    pfkey_sockaddr_pair_size(family));
+
+	kma = (struct sadb_x_kmaddress *)skb_put(skb, size_req);
+	memset(kma, 0, size_req);
+	kma->sadb_x_kmaddress_len = size_req / 8;
+	kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS;
+	kma->sadb_x_kmaddress_reserved = k->reserved;
+
+	sa = (u8 *)(kma + 1);
+	if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) ||
+	    !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int set_ipsecrequest(struct sk_buff *skb,
 			    uint8_t proto, uint8_t mode, int level,
 			    uint32_t reqid, uint8_t family,
@@ -3351,7 +3391,8 @@ static int set_ipsecrequest(struct sk_buff *skb,
 
 #ifdef CONFIG_NET_KEY_MIGRATE
 static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
-			      struct xfrm_migrate *m, int num_bundles)
+			      struct xfrm_migrate *m, int num_bundles,
+			      struct xfrm_kmaddress *k)
 {
 	int i;
 	int sasize_sel;
@@ -3368,6 +3409,12 @@ static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
 	if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH)
 		return -EINVAL;
 
+	if (k != NULL) {
+		/* addresses for KM */
+		size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) +
+				     pfkey_sockaddr_pair_size(k->family));
+	}
+
 	/* selector */
 	sasize_sel = pfkey_sockaddr_size(sel->family);
 	if (!sasize_sel)
@@ -3404,6 +3451,10 @@ static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
 	hdr->sadb_msg_seq = 0;
 	hdr->sadb_msg_pid = 0;
 
+	/* Addresses to be used by KM for negotiation, if ext is available */
+	if (k != NULL && (set_sadb_kmaddress(skb, k) < 0))
+		return -EINVAL;
+
 	/* selector src */
 	set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel);
 
@@ -3449,7 +3500,8 @@ err:
 }
 #else
 static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
-			      struct xfrm_migrate *m, int num_bundles)
+			      struct xfrm_migrate *m, int num_bundles,
+			      struct xfrm_kmaddress *k)
 {
 	return -ENOPROTOOPT;
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b7ec08025ff..832b47c1de8 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2679,7 +2679,8 @@ static int xfrm_migrate_check(struct xfrm_migrate *m, int num_migrate)
 }
 
 int xfrm_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
-		 struct xfrm_migrate *m, int num_migrate)
+		 struct xfrm_migrate *m, int num_migrate,
+		 struct xfrm_kmaddress *k)
 {
 	int i, err, nx_cur = 0, nx_new = 0;
 	struct xfrm_policy *pol = NULL;
@@ -2723,7 +2724,7 @@ int xfrm_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
 	}
 
 	/* Stage 5 - announce */
-	km_migrate(sel, dir, type, m, num_migrate);
+	km_migrate(sel, dir, type, m, num_migrate, k);
 
 	xfrm_pol_put(pol);
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 747fd8c291a..508337f9724 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1814,7 +1814,8 @@ EXPORT_SYMBOL(km_policy_expired);
 
 #ifdef CONFIG_XFRM_MIGRATE
 int km_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
-	       struct xfrm_migrate *m, int num_migrate)
+	       struct xfrm_migrate *m, int num_migrate,
+	       struct xfrm_kmaddress *k)
 {
 	int err = -EINVAL;
 	int ret;
@@ -1823,7 +1824,7 @@ int km_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
 	read_lock(&xfrm_km_lock);
 	list_for_each_entry(km, &xfrm_km_list, list) {
 		if (km->migrate) {
-			ret = km->migrate(sel, dir, type, m, num_migrate);
+			ret = km->migrate(sel, dir, type, m, num_migrate, k);
 			if (!ret)
 				err = ret;
 		}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 76f75df21e1..4a8a1abb59e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1710,12 +1710,23 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 #ifdef CONFIG_XFRM_MIGRATE
 static int copy_from_user_migrate(struct xfrm_migrate *ma,
+				  struct xfrm_kmaddress *k,
 				  struct nlattr **attrs, int *num)
 {
 	struct nlattr *rt = attrs[XFRMA_MIGRATE];
 	struct xfrm_user_migrate *um;
 	int i, num_migrate;
 
+	if (k != NULL) {
+		struct xfrm_user_kmaddress *uk;
+
+		uk = nla_data(attrs[XFRMA_KMADDRESS]);
+		memcpy(&k->local, &uk->local, sizeof(k->local));
+		memcpy(&k->remote, &uk->remote, sizeof(k->remote));
+		k->family = uk->family;
+		k->reserved = uk->reserved;
+	}
+
 	um = nla_data(rt);
 	num_migrate = nla_len(rt) / sizeof(*um);
 
@@ -1745,6 +1756,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct xfrm_userpolicy_id *pi = nlmsg_data(nlh);
 	struct xfrm_migrate m[XFRM_MAX_DEPTH];
+	struct xfrm_kmaddress km, *kmp;
 	u8 type;
 	int err;
 	int n = 0;
@@ -1752,19 +1764,20 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (attrs[XFRMA_MIGRATE] == NULL)
 		return -EINVAL;
 
+	kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL;
+
 	err = copy_from_user_policy_type(&type, attrs);
 	if (err)
 		return err;
 
-	err = copy_from_user_migrate((struct xfrm_migrate *)m,
-				     attrs, &n);
+	err = copy_from_user_migrate((struct xfrm_migrate *)m, kmp, attrs, &n);
 	if (err)
 		return err;
 
 	if (!n)
 		return 0;
 
-	xfrm_migrate(&pi->sel, pi->dir, type, m, n);
+	xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp);
 
 	return 0;
 }
@@ -1795,16 +1808,30 @@ static int copy_to_user_migrate(struct xfrm_migrate *m, struct sk_buff *skb)
 	return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um);
 }
 
-static inline size_t xfrm_migrate_msgsize(int num_migrate)
+static int copy_to_user_kmaddress(struct xfrm_kmaddress *k, struct sk_buff *skb)
+{
+	struct xfrm_user_kmaddress uk;
+
+	memset(&uk, 0, sizeof(uk));
+	uk.family = k->family;
+	uk.reserved = k->reserved;
+	memcpy(&uk.local, &k->local, sizeof(uk.local));
+	memcpy(&uk.remote, &k->local, sizeof(uk.remote));
+
+	return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk);
+}
+
+static inline size_t xfrm_migrate_msgsize(int num_migrate, int with_kma)
 {
 	return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id))
-	       + nla_total_size(sizeof(struct xfrm_user_migrate) * num_migrate)
-	       + userpolicy_type_attrsize();
+	      + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0)
+	      + nla_total_size(sizeof(struct xfrm_user_migrate) * num_migrate)
+	      + userpolicy_type_attrsize();
 }
 
 static int build_migrate(struct sk_buff *skb, struct xfrm_migrate *m,
-			 int num_migrate, struct xfrm_selector *sel,
-			 u8 dir, u8 type)
+			 int num_migrate, struct xfrm_kmaddress *k,
+			 struct xfrm_selector *sel, u8 dir, u8 type)
 {
 	struct xfrm_migrate *mp;
 	struct xfrm_userpolicy_id *pol_id;
@@ -1821,6 +1848,9 @@ static int build_migrate(struct sk_buff *skb, struct xfrm_migrate *m,
 	memcpy(&pol_id->sel, sel, sizeof(pol_id->sel));
 	pol_id->dir = dir;
 
+	if (k != NULL && (copy_to_user_kmaddress(k, skb) < 0))
+			goto nlmsg_failure;
+
 	if (copy_to_user_policy_type(type, skb) < 0)
 		goto nlmsg_failure;
 
@@ -1836,23 +1866,25 @@ nlmsg_failure:
 }
 
 static int xfrm_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
-			     struct xfrm_migrate *m, int num_migrate)
+			     struct xfrm_migrate *m, int num_migrate,
+			     struct xfrm_kmaddress *k)
 {
 	struct sk_buff *skb;
 
-	skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate), GFP_ATOMIC);
+	skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
 
 	/* build migrate */
-	if (build_migrate(skb, m, num_migrate, sel, dir, type) < 0)
+	if (build_migrate(skb, m, num_migrate, k, sel, dir, type) < 0)
 		BUG();
 
 	return nlmsg_multicast(xfrm_nl, skb, 0, XFRMNLGRP_MIGRATE, GFP_ATOMIC);
 }
 #else
 static int xfrm_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
-			     struct xfrm_migrate *m, int num_migrate)
+			     struct xfrm_migrate *m, int num_migrate,
+			     struct xfrm_kmaddress *k)
 {
 	return -ENOPROTOOPT;
 }
@@ -1901,6 +1933,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_COADDR]		= { .len = sizeof(xfrm_address_t) },
 	[XFRMA_POLICY_TYPE]	= { .len = sizeof(struct xfrm_userpolicy_type)},
 	[XFRMA_MIGRATE]		= { .len = sizeof(struct xfrm_user_migrate) },
+	[XFRMA_KMADDRESS]	= { .len = sizeof(struct xfrm_user_kmaddress) },
 };
 
 static struct xfrm_link {
-- 
cgit v1.2.3


From 554794de7949d1a6279336404c066f974d4c2bde Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 6 Oct 2008 09:54:39 -0700
Subject: pkt_sched: Fix handling of gso skbs on requeuing

Jay Cliburn noticed and diagnosed a bug triggered in
dev_gso_skb_destructor() after last change from qdisc->gso_skb
to qdisc->requeue list. Since gso_segmented skbs can't be queued
to another list this patch brings back qdisc->gso_skb for them.

Reported-by: Jay Cliburn <jcliburn@gmail.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5e7e0bd38fe..3db4cf1bd26 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,7 +44,10 @@ static inline int qdisc_qlen(struct Qdisc *q)
 
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-	__skb_queue_head(&q->requeue, skb);
+	if (unlikely(skb->next))
+		q->gso_skb = skb;
+	else
+		__skb_queue_head(&q->requeue, skb);
 
 	__netif_schedule(q);
 	return 0;
@@ -52,7 +55,10 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 
 static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
-	struct sk_buff *skb = skb_peek(&q->requeue);
+	struct sk_buff *skb = q->gso_skb;
+
+	if (!skb)
+		skb = skb_peek(&q->requeue);
 
 	if (unlikely(skb)) {
 		struct net_device *dev = qdisc_dev(q);
@@ -60,10 +66,15 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 
 		/* check the reason of requeuing without tx lock first */
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
-		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
-			__skb_unlink(skb, &q->requeue);
-		else
+		if (!netif_tx_queue_stopped(txq) &&
+		    !netif_tx_queue_frozen(txq)) {
+			if (q->gso_skb)
+				q->gso_skb = NULL;
+			else
+				__skb_unlink(skb, &q->requeue);
+		} else {
 			skb = NULL;
+		}
 	} else {
 		skb = q->dequeue(q);
 	}
@@ -548,6 +559,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
 
+	kfree_skb(qdisc->gso_skb);
 	__skb_queue_purge(&qdisc->requeue);
 
 	kfree((char *) qdisc - qdisc->padded);
-- 
cgit v1.2.3


From 6252352d16f7b45a0fd42224f7e70e0288dc4480 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 6 Oct 2008 10:41:50 -0700
Subject: pkt_sched: Simplify dev_requeue_skb and dequeue_skb

qdisc->requeue was planned to universally replace all requeuing code,
but at the top level we never requeue more than one skb, so qdisc->
gso_skb is enough for this. qdisc->requeue would be used on the lower
levels only for one level deep requeuing (like in sch_hfsc) after
finishing all the changes.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3db4cf1bd26..31f6b614b59 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,12 +44,9 @@ static inline int qdisc_qlen(struct Qdisc *q)
 
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-	if (unlikely(skb->next))
-		q->gso_skb = skb;
-	else
-		__skb_queue_head(&q->requeue, skb);
-
+	q->gso_skb = skb;
 	__netif_schedule(q);
+
 	return 0;
 }
 
@@ -57,24 +54,16 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
 	struct sk_buff *skb = q->gso_skb;
 
-	if (!skb)
-		skb = skb_peek(&q->requeue);
-
 	if (unlikely(skb)) {
 		struct net_device *dev = qdisc_dev(q);
 		struct netdev_queue *txq;
 
 		/* check the reason of requeuing without tx lock first */
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
-		if (!netif_tx_queue_stopped(txq) &&
-		    !netif_tx_queue_frozen(txq)) {
-			if (q->gso_skb)
-				q->gso_skb = NULL;
-			else
-				__skb_unlink(skb, &q->requeue);
-		} else {
+		if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
+			q->gso_skb = NULL;
+		else
 			skb = NULL;
-		}
 	} else {
 		skb = q->dequeue(q);
 	}
-- 
cgit v1.2.3


From c7004482e8dcb7c3c72666395cfa98a216a4fb70 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 6 Oct 2008 10:43:54 -0700
Subject: tcp: Respect SO_RCVLOWAT in tcp_poll().

Based upon a report by Vito Caputo.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1ab341e5d3e..7d81a1ee550 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -384,13 +384,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	/* Connected? */
 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		int target = sock_rcvlowat(sk, 0, INT_MAX);
+
+		if (tp->urg_seq == tp->copied_seq &&
+		    !sock_flag(sk, SOCK_URGINLINE) &&
+		    tp->urg_data)
+			target--;
+
 		/* Potential race condition. If read of tp below will
 		 * escape above sk->sk_state, we can be illegally awaken
 		 * in SYN_* states. */
-		if ((tp->rcv_nxt != tp->copied_seq) &&
-		    (tp->urg_seq != tp->copied_seq ||
-		     tp->rcv_nxt != tp->copied_seq + 1 ||
-		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
+		if (tp->rcv_nxt - tp->copied_seq >= target)
 			mask |= POLLIN | POLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-- 
cgit v1.2.3


From 64be8608c163bd480cf5ec4b34366f11e0f3c87f Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Mon, 6 Oct 2008 14:45:18 -0500
Subject: svcrdma: Add FRMR get/put services

Add services for the allocating, freeing, and unmapping Fast Reg MR. These
services will be used by the transport connection setup, send and receive
routines.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 116 +++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 900cb69728c..f0b5c5f2f62 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -100,6 +100,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 	ctxt->xprt = xprt;
 	INIT_LIST_HEAD(&ctxt->dto_q);
 	ctxt->count = 0;
+	ctxt->frmr = NULL;
 	atomic_inc(&xprt->sc_ctxt_used);
 	return ctxt;
 }
@@ -109,11 +110,19 @@ static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 	struct svcxprt_rdma *xprt = ctxt->xprt;
 	int i;
 	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
-		atomic_dec(&xprt->sc_dma_used);
-		ib_dma_unmap_single(xprt->sc_cm_id->device,
-				    ctxt->sge[i].addr,
-				    ctxt->sge[i].length,
-				    ctxt->direction);
+		/*
+		 * Unmap the DMA addr in the SGE if the lkey matches
+		 * the sc_dma_lkey, otherwise, ignore it since it is
+		 * an FRMR lkey and will be unmapped later when the
+		 * last WR that uses it completes.
+		 */
+		if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+			atomic_dec(&xprt->sc_dma_used);
+			ib_dma_unmap_single(xprt->sc_cm_id->device,
+					    ctxt->sge[i].addr,
+					    ctxt->sge[i].length,
+					    ctxt->direction);
+		}
 	}
 }
 
@@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
 		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
 	}
 	map->count = 0;
+	map->frmr = NULL;
 	return map;
 }
 
@@ -425,10 +435,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
 	init_waitqueue_head(&cma_xprt->sc_send_wait);
 
 	spin_lock_init(&cma_xprt->sc_lock);
 	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+	spin_lock_init(&cma_xprt->sc_frmr_q_lock);
 
 	cma_xprt->sc_ord = svcrdma_ord;
 
@@ -686,6 +698,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 	return ERR_PTR(ret);
 }
 
+static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
+{
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *pl;
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
+	if (!frmr)
+		goto err;
+
+	mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+	if (!mr)
+		goto err_free_frmr;
+
+	pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
+					 RPCSVC_MAXPAGES);
+	if (!pl)
+		goto err_free_mr;
+
+	frmr->mr = mr;
+	frmr->page_list = pl;
+	INIT_LIST_HEAD(&frmr->frmr_list);
+	return frmr;
+
+ err_free_mr:
+	ib_dereg_mr(mr);
+ err_free_frmr:
+	kfree(frmr);
+ err:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
+{
+	struct svc_rdma_fastreg_mr *frmr;
+
+	while (!list_empty(&xprt->sc_frmr_q)) {
+		frmr = list_entry(xprt->sc_frmr_q.next,
+				  struct svc_rdma_fastreg_mr, frmr_list);
+		list_del_init(&frmr->frmr_list);
+		ib_dereg_mr(frmr->mr);
+		ib_free_fast_reg_page_list(frmr->page_list);
+		kfree(frmr);
+	}
+}
+
+struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_fastreg_mr *frmr = NULL;
+
+	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	if (!list_empty(&rdma->sc_frmr_q)) {
+		frmr = list_entry(rdma->sc_frmr_q.next,
+				  struct svc_rdma_fastreg_mr, frmr_list);
+		list_del_init(&frmr->frmr_list);
+		frmr->map_len = 0;
+		frmr->page_list_len = 0;
+	}
+	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	if (frmr)
+		return frmr;
+
+	return rdma_alloc_frmr(rdma);
+}
+
+static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
+			   struct svc_rdma_fastreg_mr *frmr)
+{
+	int page_no;
+	for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+		dma_addr_t addr = frmr->page_list->page_list[page_no];
+		if (ib_dma_mapping_error(frmr->mr->device, addr))
+			continue;
+		atomic_dec(&xprt->sc_dma_used);
+		ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE,
+				    frmr->direction);
+	}
+}
+
+void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
+		       struct svc_rdma_fastreg_mr *frmr)
+{
+	if (frmr) {
+		frmr_unmap_dma(rdma, frmr);
+		spin_lock_bh(&rdma->sc_frmr_q_lock);
+		BUG_ON(!list_empty(&frmr->frmr_list));
+		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
+		spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	}
+}
+
 /*
  * This is the xpo_recvfrom function for listening endpoints. Its
  * purpose is to accept incoming connections. The CMA callback handler
@@ -961,6 +1064,9 @@ static void __svc_rdma_free(struct work_struct *work)
 	WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
 	WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
 
+	/* De-allocate fastreg mr */
+	rdma_dealloc_frmr_q(rdma);
+
 	/* Destroy the QP if present (not a listener) */
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
 		ib_destroy_qp(rdma->sc_qp);
-- 
cgit v1.2.3


From 3a5c63803d0552a3ad93b85c262f12cd86471443 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 30 Sep 2008 13:46:13 -0500
Subject: svcrdma: Query device for Fast Reg support during connection setup

Query the device capabilities in the svc_rdma_accept function to determine
what advanced memory management capabilities are supported by the device.
Based on the query, select the most secure model available given the
requirements of the transport and capabilities of the adapter.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 76 +++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f0b5c5f2f62..a8ec4b1eec5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -807,6 +807,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	struct rdma_conn_param conn_param;
 	struct ib_qp_init_attr qp_attr;
 	struct ib_device_attr devattr;
+	int dma_mr_acc;
+	int need_dma_mr;
 	int ret;
 	int i;
 
@@ -922,15 +924,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	}
 	newxprt->sc_qp = newxprt->sc_cm_id->qp;
 
-	/* Register all of physical memory */
-	newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
-					    IB_ACCESS_LOCAL_WRITE |
-					    IB_ACCESS_REMOTE_WRITE);
-	if (IS_ERR(newxprt->sc_phys_mr)) {
-		dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
+	/*
+	 * Use the most secure set of MR resources based on the
+	 * transport type and available memory management features in
+	 * the device. Here's the table implemented below:
+	 *
+	 *		Fast	Global	DMA	Remote WR
+	 *		Reg	LKEY	MR	Access
+	 *		Sup'd	Sup'd	Needed	Needed
+	 *
+	 * IWARP	N	N	Y	Y
+	 *		N	Y	Y	Y
+	 *		Y	N	Y	N
+	 *		Y	Y	N	-
+	 *
+	 * IB		N	N	Y	N
+	 *		N	Y	N	-
+	 *		Y	N	Y	N
+	 *		Y	Y	N	-
+	 *
+	 * NB:	iWARP requires remote write access for the data sink
+	 *	of an RDMA_READ. IB does not.
+	 */
+	if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		newxprt->sc_frmr_pg_list_len =
+			devattr.max_fast_reg_page_list_len;
+		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+	}
+
+	/*
+	 * Determine if a DMA MR is required and if so, what privs are required
+	 */
+	switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
+	case RDMA_TRANSPORT_IWARP:
+		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+			need_dma_mr = 1;
+			dma_mr_acc =
+				(IB_ACCESS_LOCAL_WRITE |
+				 IB_ACCESS_REMOTE_WRITE);
+		} else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else
+			need_dma_mr = 0;
+		break;
+	case RDMA_TRANSPORT_IB:
+		if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else
+			need_dma_mr = 0;
+		break;
+	default:
 		goto errout;
 	}
 
+	/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
+	if (need_dma_mr) {
+		/* Register all of physical memory */
+		newxprt->sc_phys_mr =
+			ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
+		if (IS_ERR(newxprt->sc_phys_mr)) {
+			dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
+				ret);
+			goto errout;
+		}
+		newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
+	} else
+		newxprt->sc_dma_lkey =
+			newxprt->sc_cm_id->device->local_dma_lkey;
+
 	/* Post receive buffers */
 	for (i = 0; i < newxprt->sc_max_requests; i++) {
 		ret = svc_rdma_post_recv(newxprt);
-- 
cgit v1.2.3


From e1183210625cc8e02ce13eec78fb7a246567fc59 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 3 Oct 2008 15:22:18 -0500
Subject: svcrdma: Add a service to register a Fast Reg MR with the device

Fast Reg MR introduces a new WR type. Add a service to register the
region with the adapter and update the completion handling to support
completions with a NULL WR context.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 111 +++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index a8ec4b1eec5..c3e8db0eeb3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -325,6 +325,45 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 		svc_xprt_enqueue(&xprt->sc_xprt);
 }
 
+/*
+ * Processs a completion context
+ */
+static void process_context(struct svcxprt_rdma *xprt,
+			    struct svc_rdma_op_ctxt *ctxt)
+{
+	svc_rdma_unmap_dma(ctxt);
+
+	switch (ctxt->wr_op) {
+	case IB_WR_SEND:
+		svc_rdma_put_context(ctxt, 1);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+		svc_rdma_put_context(ctxt, 0);
+		break;
+
+	case IB_WR_RDMA_READ:
+		if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+			struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+			BUG_ON(!read_hdr);
+			spin_lock_bh(&xprt->sc_rq_dto_lock);
+			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+			list_add_tail(&read_hdr->dto_q,
+				      &xprt->sc_read_complete_q);
+			spin_unlock_bh(&xprt->sc_rq_dto_lock);
+			svc_xprt_enqueue(&xprt->sc_xprt);
+		}
+		svc_rdma_put_context(ctxt, 0);
+		break;
+
+	default:
+		printk(KERN_ERR "svcrdma: unexpected completion type, "
+		       "opcode=%d\n",
+		       ctxt->wr_op);
+		break;
+	}
+}
+
 /*
  * Send Queue Completion Handler - potentially called on interrupt context.
  *
@@ -337,17 +376,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 	struct ib_cq *cq = xprt->sc_sq_cq;
 	int ret;
 
-
 	if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
 		return;
 
 	ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
 	atomic_inc(&rdma_stat_sq_poll);
 	while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
-		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
-		xprt = ctxt->xprt;
-
-		svc_rdma_unmap_dma(ctxt);
 		if (wc.status != IB_WC_SUCCESS)
 			/* Close the transport */
 			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -356,35 +390,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 		atomic_dec(&xprt->sc_sq_count);
 		wake_up(&xprt->sc_send_wait);
 
-		switch (ctxt->wr_op) {
-		case IB_WR_SEND:
-			svc_rdma_put_context(ctxt, 1);
-			break;
-
-		case IB_WR_RDMA_WRITE:
-			svc_rdma_put_context(ctxt, 0);
-			break;
-
-		case IB_WR_RDMA_READ:
-			if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
-				struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
-				BUG_ON(!read_hdr);
-				spin_lock_bh(&xprt->sc_rq_dto_lock);
-				set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
-				list_add_tail(&read_hdr->dto_q,
-					      &xprt->sc_read_complete_q);
-				spin_unlock_bh(&xprt->sc_rq_dto_lock);
-				svc_xprt_enqueue(&xprt->sc_xprt);
-			}
-			svc_rdma_put_context(ctxt, 0);
-			break;
+		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+		if (ctxt)
+			process_context(xprt, ctxt);
 
-		default:
-			printk(KERN_ERR "svcrdma: unexpected completion type, "
-			       "opcode=%d, status=%d\n",
-			       wc.opcode, wc.status);
-			break;
-		}
 		svc_xprt_put(&xprt->sc_xprt);
 	}
 
@@ -1184,6 +1193,40 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
 	return 1;
 }
 
+/*
+ * Attempt to register the kvec representing the RPC memory with the
+ * device.
+ *
+ * Returns:
+ *  NULL : The device does not support fastreg or there were no more
+ *         fastreg mr.
+ *  frmr : The kvec register request was successfully posted.
+ *    <0 : An error was encountered attempting to register the kvec.
+ */
+int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
+		     struct svc_rdma_fastreg_mr *frmr)
+{
+	struct ib_send_wr fastreg_wr;
+	u8 key;
+
+	/* Bump the key */
+	key = (u8)(frmr->mr->lkey & 0x000000FF);
+	ib_update_fast_reg_key(frmr->mr, ++key);
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof fastreg_wr);
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = frmr->map_len;
+	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+	return svc_rdma_send(xprt, &fastreg_wr);
+}
+
 int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 {
 	struct ib_send_wr *bad_wr;
@@ -1193,8 +1236,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 		return -ENOTCONN;
 
 	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
-	BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
-		wr->opcode);
 	/* If the SQ is full, wait until an SQ entry is available */
 	while (1) {
 		spin_lock_bh(&xprt->sc_lock);
-- 
cgit v1.2.3


From a5abf4e81545d9c7280c49cae853cc45fd769ddf Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 30 Sep 2008 14:05:41 -0500
Subject: svcrdma: Modify post recv path to use local dma key

Update the svc_rdma_post_recv routine to use the adapter's global LKEY
instead of sc_phys_mr which is only valid when using a DMA MR.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c3e8db0eeb3..d9183cbcd96 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -483,7 +483,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 	struct ib_recv_wr recv_wr, *bad_recv_wr;
 	struct svc_rdma_op_ctxt *ctxt;
 	struct page *page;
-	unsigned long pa;
+	dma_addr_t pa;
 	int sge_no;
 	int buflen;
 	int ret;
@@ -495,13 +495,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 		BUG_ON(sge_no >= xprt->sc_max_sge);
 		page = svc_rdma_get_page();
 		ctxt->pages[sge_no] = page;
-		atomic_inc(&xprt->sc_dma_used);
 		pa = ib_dma_map_page(xprt->sc_cm_id->device,
 				     page, 0, PAGE_SIZE,
 				     DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
+			goto err_put_ctxt;
+		atomic_inc(&xprt->sc_dma_used);
 		ctxt->sge[sge_no].addr = pa;
 		ctxt->sge[sge_no].length = PAGE_SIZE;
-		ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+		ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
 		buflen += PAGE_SIZE;
 	}
 	ctxt->count = sge_no;
@@ -517,6 +519,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 		svc_rdma_put_context(ctxt, 1);
 	}
 	return ret;
+
+ err_put_ctxt:
+	svc_rdma_put_context(ctxt, 1);
+	return -ENOMEM;
 }
 
 /*
-- 
cgit v1.2.3


From 5b180a9a64ca2217a658bd515ef910eafefc5e5a Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Mon, 11 Aug 2008 14:10:19 -0500
Subject: svcrdma: Add support to svc_rdma_send to handle chained WR

WR can be submitted as linked lists of WR. Update the svc_rdma_send
routine to handle WR chains. This will be used to submit a WR that
uses an FRMR with another WR that invalidates the FRMR.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index d9183cbcd96..f22f5876766 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1235,17 +1235,23 @@ int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
 
 int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 {
-	struct ib_send_wr *bad_wr;
+	struct ib_send_wr *bad_wr, *n_wr;
+	int wr_count;
+	int i;
 	int ret;
 
 	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
 		return -ENOTCONN;
 
 	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
+	wr_count = 1;
+	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
+		wr_count++;
+
 	/* If the SQ is full, wait until an SQ entry is available */
 	while (1) {
 		spin_lock_bh(&xprt->sc_lock);
-		if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
+		if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
 			spin_unlock_bh(&xprt->sc_lock);
 			atomic_inc(&rdma_stat_sq_starve);
 
@@ -1260,19 +1266,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 				return 0;
 			continue;
 		}
-		/* Bumped used SQ WR count and post */
-		svc_xprt_get(&xprt->sc_xprt);
+		/* Take a transport ref for each WR posted */
+		for (i = 0; i < wr_count; i++)
+			svc_xprt_get(&xprt->sc_xprt);
+
+		/* Bump used SQ WR count and post */
+		atomic_add(wr_count, &xprt->sc_sq_count);
 		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
-		if (!ret)
-			atomic_inc(&xprt->sc_sq_count);
-		else {
-			svc_xprt_put(&xprt->sc_xprt);
+		if (ret) {
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			atomic_sub(wr_count, &xprt->sc_sq_count);
+			for (i = 0; i < wr_count; i ++)
+				svc_xprt_put(&xprt->sc_xprt);
 			dprintk("svcrdma: failed to post SQ WR rc=%d, "
 			       "sc_sq_count=%d, sc_sq_depth=%d\n",
 			       ret, atomic_read(&xprt->sc_sq_count),
 			       xprt->sc_sq_depth);
 		}
 		spin_unlock_bh(&xprt->sc_lock);
+		if (ret)
+			wake_up(&xprt->sc_send_wait);
 		break;
 	}
 	return ret;
-- 
cgit v1.2.3


From 146b6df6a537939570c5772ebd7db826fdbd5d82 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 12 Aug 2008 15:12:10 -0500
Subject: svcrdma: Modify the RPC recv path to use FRMR when available

RPCRDMA requests that specify a read-list are fetched with RDMA_READ. Using
an FRMR to map the data sink improves NFSRDMA security on transports that
place the RDMA_READ data sink LKEY on the wire because the valid lifetime
of the MR is only the duration of the RDMA_READ. The LKEY is invalidated
when the last RDMA_READ WR completes.

Mapping the data sink also allows for very large amounts to data to be
fetched with a single WR, so if the client is also using FRMR, the entire
RPC read-list can be fetched with a single WR.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 187 +++++++++++++++++++++++++++----
 net/sunrpc/xprtrdma/svc_rdma_transport.c |   5 +-
 2 files changed, 170 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 74de31a0661..a4756576d68 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
  *
  * Assumptions:
  * - chunk[0]->position points to pages[0] at an offset of 0
- * - pages[] is not physically or virtually contigous and consists of
+ * - pages[] is not physically or virtually contiguous and consists of
  *   PAGE_SIZE elements.
  *
  * Output:
@@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
  *   chunk in the read list
  *
  */
-static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
+static int map_read_chunks(struct svcxprt_rdma *xprt,
 			   struct svc_rqst *rqstp,
 			   struct svc_rdma_op_ctxt *head,
 			   struct rpcrdma_msg *rmsgp,
@@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
 	return sge_no;
 }
 
-static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
-			      struct svc_rdma_op_ctxt *ctxt,
-			      struct kvec *vec,
-			      u64 *sgl_offset,
-			      int count)
+/* Map a read-chunk-list to an XDR and fast register the page-list.
+ *
+ * Assumptions:
+ * - chunk[0]	position points to pages[0] at an offset of 0
+ * - pages[]	will be made physically contiguous by creating a one-off memory
+ *		region using the fastreg verb.
+ * - byte_count is # of bytes in read-chunk-list
+ * - ch_count	is # of chunks in read-chunk-list
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ *   chunk in the read list
+ */
+static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+				struct svc_rqst *rqstp,
+				struct svc_rdma_op_ctxt *head,
+				struct rpcrdma_msg *rmsgp,
+				struct svc_rdma_req_map *rpl_map,
+				struct svc_rdma_req_map *chl_map,
+				int ch_count,
+				int byte_count)
+{
+	int page_no;
+	int ch_no;
+	u32 offset;
+	struct rpcrdma_read_chunk *ch;
+	struct svc_rdma_fastreg_mr *frmr;
+	int ret = 0;
+
+	frmr = svc_rdma_get_frmr(xprt);
+	if (IS_ERR(frmr))
+		return -ENOMEM;
+
+	head->frmr = frmr;
+	head->arg.head[0] = rqstp->rq_arg.head[0];
+	head->arg.tail[0] = rqstp->rq_arg.tail[0];
+	head->arg.pages = &head->pages[head->count];
+	head->hdr_count = head->count; /* save count of hdr pages */
+	head->arg.page_base = 0;
+	head->arg.page_len = byte_count;
+	head->arg.len = rqstp->rq_arg.len + byte_count;
+	head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
+
+	/* Fast register the page list */
+	frmr->kva = page_address(rqstp->rq_arg.pages[0]);
+	frmr->direction = DMA_FROM_DEVICE;
+	frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+	frmr->map_len = byte_count;
+	frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+	for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+		frmr->page_list->page_list[page_no] =
+			ib_dma_map_single(xprt->sc_cm_id->device,
+					  page_address(rqstp->rq_arg.pages[page_no]),
+					  PAGE_SIZE, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+		atomic_inc(&xprt->sc_dma_used);
+		head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+	}
+	head->count += page_no;
+
+	/* rq_respages points one past arg pages */
+	rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+
+	/* Create the reply and chunk maps */
+	offset = 0;
+	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	for (ch_no = 0; ch_no < ch_count; ch_no++) {
+		rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+		rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
+		chl_map->ch[ch_no].count = 1;
+		chl_map->ch[ch_no].start = ch_no;
+		offset += ch->rc_target.rs_length;
+		ch++;
+	}
+
+	ret = svc_rdma_fastreg(xprt, frmr);
+	if (ret)
+		goto fatal_err;
+
+	return ch_no;
+
+ fatal_err:
+	printk("svcrdma: error fast registering xdr for xprt %p", xprt);
+	svc_rdma_put_frmr(xprt, frmr);
+	return -EIO;
+}
+
+static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+			     struct svc_rdma_op_ctxt *ctxt,
+			     struct svc_rdma_fastreg_mr *frmr,
+			     struct kvec *vec,
+			     u64 *sgl_offset,
+			     int count)
 {
 	int i;
 
 	ctxt->count = count;
 	ctxt->direction = DMA_FROM_DEVICE;
 	for (i = 0; i < count; i++) {
-		atomic_inc(&xprt->sc_dma_used);
-		ctxt->sge[i].addr =
-			ib_dma_map_single(xprt->sc_cm_id->device,
-					  vec[i].iov_base, vec[i].iov_len,
-					  DMA_FROM_DEVICE);
+		ctxt->sge[i].length = 0; /* in case map fails */
+		if (!frmr) {
+			ctxt->sge[i].addr =
+				ib_dma_map_single(xprt->sc_cm_id->device,
+						  vec[i].iov_base,
+						  vec[i].iov_len,
+						  DMA_FROM_DEVICE);
+			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+						 ctxt->sge[i].addr))
+				return -EINVAL;
+			ctxt->sge[i].lkey = xprt->sc_dma_lkey;
+			atomic_inc(&xprt->sc_dma_used);
+		} else {
+			ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
+			ctxt->sge[i].lkey = frmr->mr->lkey;
+		}
 		ctxt->sge[i].length = vec[i].iov_len;
-		ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
 		*sgl_offset = *sgl_offset + vec[i].iov_len;
 	}
+	return 0;
 }
 
 static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
@@ -278,6 +380,7 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 			 struct svc_rdma_op_ctxt *hdr_ctxt)
 {
 	struct ib_send_wr read_wr;
+	struct ib_send_wr inv_wr;
 	int err = 0;
 	int ch_no;
 	int ch_count;
@@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
 	if (ch_count > RPCSVC_MAXPAGES)
 		return -EINVAL;
-	sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
-				    rpl_map, chl_map,
-				    ch_count, byte_count);
+
+	if (!xprt->sc_frmr_pg_list_len)
+		sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+					    rpl_map, chl_map, ch_count,
+					    byte_count);
+	else
+		sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+						 rpl_map, chl_map, ch_count,
+						 byte_count);
+	if (sge_count < 0) {
+		err = -EIO;
+		goto out;
+	}
+
 	sgl_offset = 0;
 	ch_no = 0;
 
@@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 next_sge:
 		ctxt = svc_rdma_get_context(xprt);
 		ctxt->direction = DMA_FROM_DEVICE;
+		ctxt->frmr = hdr_ctxt->frmr;
+		ctxt->read_hdr = NULL;
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+		clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
 
 		/* Prepare READ WR */
 		memset(&read_wr, 0, sizeof read_wr);
-		ctxt->wr_op = IB_WR_RDMA_READ;
 		read_wr.wr_id = (unsigned long)ctxt;
 		read_wr.opcode = IB_WR_RDMA_READ;
+		ctxt->wr_op = read_wr.opcode;
 		read_wr.send_flags = IB_SEND_SIGNALED;
 		read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
 		read_wr.wr.rdma.remote_addr =
@@ -327,10 +444,15 @@ next_sge:
 		read_wr.sg_list = ctxt->sge;
 		read_wr.num_sge =
 			rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
-		rdma_set_ctxt_sge(xprt, ctxt,
-				  &rpl_map->sge[chl_map->ch[ch_no].start],
-				  &sgl_offset,
-				  read_wr.num_sge);
+		err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+					&rpl_map->sge[chl_map->ch[ch_no].start],
+					&sgl_offset,
+					read_wr.num_sge);
+		if (err) {
+			svc_rdma_unmap_dma(ctxt);
+			svc_rdma_put_context(ctxt, 0);
+			goto out;
+		}
 		if (((ch+1)->rc_discrim == 0) &&
 		    (read_wr.num_sge == chl_map->ch[ch_no].count)) {
 			/*
@@ -339,6 +461,29 @@ next_sge:
 			 * the client and the RPC needs to be enqueued.
 			 */
 			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+			if (hdr_ctxt->frmr) {
+				set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+				/*
+				 * Invalidate the local MR used to map the data
+				 * sink.
+				 */
+				if (xprt->sc_dev_caps &
+				    SVCRDMA_DEVCAP_READ_W_INV) {
+					read_wr.opcode =
+						IB_WR_RDMA_READ_WITH_INV;
+					ctxt->wr_op = read_wr.opcode;
+					read_wr.ex.invalidate_rkey =
+						ctxt->frmr->mr->lkey;
+				} else {
+					/* Prepare INVALIDATE WR */
+					memset(&inv_wr, 0, sizeof inv_wr);
+					inv_wr.opcode = IB_WR_LOCAL_INV;
+					inv_wr.send_flags = IB_SEND_SIGNALED;
+					inv_wr.ex.invalidate_rkey =
+						hdr_ctxt->frmr->mr->lkey;
+					read_wr.next = &inv_wr;
+				}
+			}
 			ctxt->read_hdr = hdr_ctxt;
 		}
 		/* Post the read */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f22f5876766..fb0dff5e53e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -105,7 +105,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 	return ctxt;
 }
 
-static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
 	struct svcxprt_rdma *xprt = ctxt->xprt;
 	int i;
@@ -343,9 +343,12 @@ static void process_context(struct svcxprt_rdma *xprt,
 		break;
 
 	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_READ_WITH_INV:
 		if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
 			struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
 			BUG_ON(!read_hdr);
+			if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+				svc_rdma_put_frmr(xprt, ctxt->frmr);
 			spin_lock_bh(&xprt->sc_rq_dto_lock);
 			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
 			list_add_tail(&read_hdr->dto_q,
-- 
cgit v1.2.3


From afd566ea080572499cc01d42d2f578bf4b54f20f Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 3 Oct 2008 15:45:03 -0500
Subject: svcrdma: Modify the RPC reply path to use FRMR when available

Use FRMR to map local RPC reply data. This allows RDMA_WRITE to send reply
data using a single WR. The FRMR is invalidated by linking the LOCAL_INV WR
to the RDMA_SEND message used to complete the reply.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 255 ++++++++++++++++++++++++++-----
 net/sunrpc/xprtrdma/svc_rdma_transport.c |   2 +
 2 files changed, 217 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 84d328329d9..9a7a8e7ae03 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -69,9 +69,127 @@
  * array is only concerned with the reply we are assured that we have
  * on extra page for the RPCRMDA header.
  */
-static void xdr_to_sge(struct svcxprt_rdma *xprt,
-		       struct xdr_buf *xdr,
-		       struct svc_rdma_req_map *vec)
+int fast_reg_xdr(struct svcxprt_rdma *xprt,
+		 struct xdr_buf *xdr,
+		 struct svc_rdma_req_map *vec)
+{
+	int sge_no;
+	u32 sge_bytes;
+	u32 page_bytes;
+	u32 page_off;
+	int page_no = 0;
+	u8 *frva;
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = svc_rdma_get_frmr(xprt);
+	if (IS_ERR(frmr))
+		return -ENOMEM;
+	vec->frmr = frmr;
+
+	/* Skip the RPCRDMA header */
+	sge_no = 1;
+
+	/* Map the head. */
+	frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
+	vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+	vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+	vec->count = 2;
+	sge_no++;
+
+	/* Build the FRMR */
+	frmr->kva = frva;
+	frmr->direction = DMA_TO_DEVICE;
+	frmr->access_flags = 0;
+	frmr->map_len = PAGE_SIZE;
+	frmr->page_list_len = 1;
+	frmr->page_list->page_list[page_no] =
+		ib_dma_map_single(xprt->sc_cm_id->device,
+				  (void *)xdr->head[0].iov_base,
+				  PAGE_SIZE, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+				 frmr->page_list->page_list[page_no]))
+		goto fatal_err;
+	atomic_inc(&xprt->sc_dma_used);
+
+	page_off = xdr->page_base;
+	page_bytes = xdr->page_len + page_off;
+	if (!page_bytes)
+		goto encode_tail;
+
+	/* Map the pages */
+	vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+	vec->sge[sge_no].iov_len = page_bytes;
+	sge_no++;
+	while (page_bytes) {
+		struct page *page;
+
+		page = xdr->pages[page_no++];
+		sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+		page_bytes -= sge_bytes;
+
+		frmr->page_list->page_list[page_no] =
+			ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
+					  PAGE_SIZE, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+
+		atomic_inc(&xprt->sc_dma_used);
+		page_off = 0; /* reset for next time through loop */
+		frmr->map_len += PAGE_SIZE;
+		frmr->page_list_len++;
+	}
+	vec->count++;
+
+ encode_tail:
+	/* Map tail */
+	if (0 == xdr->tail[0].iov_len)
+		goto done;
+
+	vec->count++;
+	vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+
+	if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
+	    ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
+		/*
+		 * If head and tail use the same page, we don't need
+		 * to map it again.
+		 */
+		vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+	} else {
+		void *va;
+
+		/* Map another page for the tail */
+		page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+		va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
+		vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+
+		frmr->page_list->page_list[page_no] =
+			ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
+					  DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+		atomic_inc(&xprt->sc_dma_used);
+		frmr->map_len += PAGE_SIZE;
+		frmr->page_list_len++;
+	}
+
+ done:
+	if (svc_rdma_fastreg(xprt, frmr))
+		goto fatal_err;
+
+	return 0;
+
+ fatal_err:
+	printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
+	svc_rdma_put_frmr(xprt, frmr);
+	return -EIO;
+}
+
+static int map_xdr(struct svcxprt_rdma *xprt,
+		   struct xdr_buf *xdr,
+		   struct svc_rdma_req_map *vec)
 {
 	int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
 	int sge_no;
@@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
 	BUG_ON(xdr->len !=
 	       (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
 
+	if (xprt->sc_frmr_pg_list_len)
+		return fast_reg_xdr(xprt, xdr, vec);
+
 	/* Skip the first sge, this is for the RPCRDMA header */
 	sge_no = 1;
 
@@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
 
 	BUG_ON(sge_no > sge_max);
 	vec->count = sge_no;
+	return 0;
 }
 
 /* Assumptions:
+ * - We are using FRMR
+ *     - or -
  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
  */
 static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
@@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 	sge_no = 0;
 
 	/* Copy the remaining SGE */
-	while (bc != 0 && xdr_sge_no < vec->count) {
-		sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
-		sge_bytes = min((size_t)bc,
-				(size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
+	while (bc != 0) {
+		sge_bytes = min_t(size_t,
+			  bc, vec->sge[xdr_sge_no].iov_len-sge_off);
 		sge[sge_no].length = sge_bytes;
-		atomic_inc(&xprt->sc_dma_used);
-		sge[sge_no].addr =
-			ib_dma_map_single(xprt->sc_cm_id->device,
-					  (void *)
-					  vec->sge[xdr_sge_no].iov_base + sge_off,
-					  sge_bytes, DMA_TO_DEVICE);
-		if (dma_mapping_error(xprt->sc_cm_id->device->dma_device,
-					sge[sge_no].addr))
-			goto err;
+		if (!vec->frmr) {
+			sge[sge_no].addr =
+				ib_dma_map_single(xprt->sc_cm_id->device,
+						  (void *)
+						  vec->sge[xdr_sge_no].iov_base + sge_off,
+						  sge_bytes, DMA_TO_DEVICE);
+			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+						 sge[sge_no].addr))
+				goto err;
+			atomic_inc(&xprt->sc_dma_used);
+			sge[sge_no].lkey = xprt->sc_dma_lkey;
+		} else {
+			sge[sge_no].addr = (unsigned long)
+				vec->sge[xdr_sge_no].iov_base + sge_off;
+			sge[sge_no].lkey = vec->frmr->mr->lkey;
+		}
+		ctxt->count++;
+		ctxt->frmr = vec->frmr;
 		sge_off = 0;
 		sge_no++;
-		ctxt->count++;
 		xdr_sge_no++;
+		BUG_ON(xdr_sge_no > vec->count);
 		bc -= sge_bytes;
 	}
 
-	BUG_ON(bc != 0);
-	BUG_ON(xdr_sge_no > vec->count);
-
 	/* Prepare WRITE WR */
 	memset(&write_wr, 0, sizeof write_wr);
 	ctxt->wr_op = IB_WR_RDMA_WRITE;
@@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
 	res_ary = (struct rpcrdma_write_array *)
 		&rdma_resp->rm_body.rm_chunks[1];
 
-	max_write = xprt->sc_max_sge * PAGE_SIZE;
+	if (vec->frmr)
+		max_write = vec->frmr->map_len;
+	else
+		max_write = xprt->sc_max_sge * PAGE_SIZE;
 
 	/* Write chunks start at the pagelist */
 	for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
@@ -297,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
 	res_ary = (struct rpcrdma_write_array *)
 		&rdma_resp->rm_body.rm_chunks[2];
 
-	max_write = xprt->sc_max_sge * PAGE_SIZE;
+	if (vec->frmr)
+		max_write = vec->frmr->map_len;
+	else
+		max_write = xprt->sc_max_sge * PAGE_SIZE;
 
 	/* xdr offset starts at RPC message */
 	for (xdr_off = 0, chunk_no = 0;
@@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
 		ch = &arg_ary->wc_array[chunk_no].wc_target;
 		write_len = min(xfer_len, ch->rs_length);
 
-
 		/* Prepare the reply chunk given the length actually
 		 * written */
 		rs_offset = get_unaligned(&(ch->rs_offset));
@@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		      int byte_count)
 {
 	struct ib_send_wr send_wr;
+	struct ib_send_wr inv_wr;
 	int sge_no;
 	int sge_bytes;
 	int page_no;
@@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	/* Prepare the context */
 	ctxt->pages[0] = page;
 	ctxt->count = 1;
+	ctxt->frmr = vec->frmr;
+	if (vec->frmr)
+		set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+	else
+		clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
 
 	/* Prepare the SGE for the RPCRDMA Header */
-	atomic_inc(&rdma->sc_dma_used);
 	ctxt->sge[0].addr =
 		ib_dma_map_page(rdma->sc_cm_id->device,
 				page, 0, PAGE_SIZE, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+		goto err;
+	atomic_inc(&rdma->sc_dma_used);
+
 	ctxt->direction = DMA_TO_DEVICE;
+
 	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
-	ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
+	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
 
 	/* Determine how many of our SGE are to be transmitted */
 	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
 		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
 		byte_count -= sge_bytes;
-		atomic_inc(&rdma->sc_dma_used);
-		ctxt->sge[sge_no].addr =
-			ib_dma_map_single(rdma->sc_cm_id->device,
-					  vec->sge[sge_no].iov_base,
-					  sge_bytes, DMA_TO_DEVICE);
+		if (!vec->frmr) {
+			ctxt->sge[sge_no].addr =
+				ib_dma_map_single(rdma->sc_cm_id->device,
+						  vec->sge[sge_no].iov_base,
+						  sge_bytes, DMA_TO_DEVICE);
+			if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+						 ctxt->sge[sge_no].addr))
+				goto err;
+			atomic_inc(&rdma->sc_dma_used);
+			ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+		} else {
+			ctxt->sge[sge_no].addr = (unsigned long)
+				vec->sge[sge_no].iov_base;
+			ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
+		}
 		ctxt->sge[sge_no].length = sge_bytes;
-		ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
 	}
 	BUG_ON(byte_count != 0);
 
@@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
 		ctxt->count++;
 		rqstp->rq_respages[page_no] = NULL;
-		/* If there are more pages than SGE, terminate SGE list */
+		/*
+		 * If there are more pages than SGE, terminate SGE
+		 * list so that svc_rdma_unmap_dma doesn't attempt to
+		 * unmap garbage.
+		 */
 		if (page_no+1 >= sge_no)
 			ctxt->sge[page_no+1].length = 0;
 	}
 	BUG_ON(sge_no > rdma->sc_max_sge);
+	BUG_ON(sge_no > ctxt->count);
 	memset(&send_wr, 0, sizeof send_wr);
 	ctxt->wr_op = IB_WR_SEND;
 	send_wr.wr_id = (unsigned long)ctxt;
@@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	send_wr.num_sge = sge_no;
 	send_wr.opcode = IB_WR_SEND;
 	send_wr.send_flags =  IB_SEND_SIGNALED;
+	if (vec->frmr) {
+		/* Prepare INVALIDATE WR */
+		memset(&inv_wr, 0, sizeof inv_wr);
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.send_flags = IB_SEND_SIGNALED;
+		inv_wr.ex.invalidate_rkey =
+			vec->frmr->mr->lkey;
+		send_wr.next = &inv_wr;
+	}
 
 	ret = svc_rdma_send(rdma, &send_wr);
 	if (ret)
-		svc_rdma_put_context(ctxt, 1);
+		goto err;
 
-	return ret;
+	return 0;
+
+ err:
+	svc_rdma_put_frmr(rdma, vec->frmr);
+	svc_rdma_put_context(ctxt, 1);
+	return -EIO;
 }
 
 void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	ctxt = svc_rdma_get_context(rdma);
 	ctxt->direction = DMA_TO_DEVICE;
 	vec = svc_rdma_get_req_map();
-	xdr_to_sge(rdma, &rqstp->rq_res, vec);
-
+	ret = map_xdr(rdma, &rqstp->rq_res, vec);
+	if (ret)
+		goto err0;
 	inline_bytes = rqstp->rq_res.len;
 
 	/* Create the RDMA response header */
@@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (ret < 0) {
 		printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
 		       ret);
-		goto error;
+		goto err1;
 	}
 	inline_bytes -= ret;
 
@@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (ret < 0) {
 		printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
 		       ret);
-		goto error;
+		goto err1;
 	}
 	inline_bytes -= ret;
 
@@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	svc_rdma_put_req_map(vec);
 	dprintk("svcrdma: send_reply returns %d\n", ret);
 	return ret;
- error:
+
+ err1:
+	put_page(res_page);
+ err0:
 	svc_rdma_put_req_map(vec);
 	svc_rdma_put_context(ctxt, 0);
-	put_page(res_page);
 	return ret;
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fb0dff5e53e..98f945c5a00 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -335,6 +335,8 @@ static void process_context(struct svcxprt_rdma *xprt,
 
 	switch (ctxt->wr_op) {
 	case IB_WR_SEND:
+		if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+			svc_rdma_put_frmr(xprt, ctxt->frmr);
 		svc_rdma_put_context(ctxt, 1);
 		break;
 
-- 
cgit v1.2.3


From 04911b539c9817aa88a6da8f563e65e3e0bc974b Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Mon, 11 Aug 2008 15:14:53 -0500
Subject: svcrdma: Update svc_rdma_send_error to use DMA LKEY

Update the svc_rdma_send_error code to use the DMA LKEY which is valid
regardless of the memory registration strategy in use.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 98f945c5a00..c0cd334fe1c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1314,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
 
 	/* Prepare SGE for local address */
-	atomic_inc(&xprt->sc_dma_used);
 	sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
 				   p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
-	sge.lkey = xprt->sc_phys_mr->lkey;
+	if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
+		put_page(p);
+		return;
+	}
+	atomic_inc(&xprt->sc_dma_used);
+	sge.lkey = xprt->sc_dma_lkey;
 	sge.length = length;
 
 	ctxt = svc_rdma_get_context(xprt);
@@ -1338,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	if (ret) {
 		dprintk("svcrdma: Error %d posting send for protocol error\n",
 			ret);
+		ib_dma_unmap_page(xprt->sc_cm_id->device,
+				  sge.addr, PAGE_SIZE,
+				  DMA_FROM_DEVICE);
 		svc_rdma_put_context(ctxt, 1);
 	}
 }
-- 
cgit v1.2.3


From 67080c82361b7510b602c87b83399421aa2d2895 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 3 Oct 2008 12:41:14 -0500
Subject: svcrdma: Fix IRD/ORD polarity

The inititator/responder resources in the event have been swapped. They
no represent what the local peer would set their values to in order to
match the peer. Note that iWARP does not exchange these on the wire and
the provider is simply putting in the local device max.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c0cd334fe1c..6fb493cbd29 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -598,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
 		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
 			"event=%d\n", cma_id, cma_id->context, event->event);
 		handle_connect_req(cma_id,
-				   event->param.conn.responder_resources);
+				   event->param.conn.initiator_depth);
 		break;
 
 	case RDMA_CM_EVENT_ESTABLISHED:
-- 
cgit v1.2.3


From cb7f6a7b716e801097b564dec3ccb58d330aef56 Mon Sep 17 00:00:00 2001
From: Julius Volz <juliusv@google.com>
Date: Fri, 19 Sep 2008 12:32:57 +0200
Subject: IPVS: Move IPVS to net/netfilter/ipvs

Since IPVS now has partial IPv6 support, this patch moves IPVS from
net/ipv4/ipvs to net/netfilter/ipvs. It's a result of:

$ git mv net/ipv4/ipvs net/netfilter

and adapting the relevant Kconfigs/Makefiles to the new path.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/Kconfig                        |    2 -
 net/ipv4/Makefile                       |    1 -
 net/ipv4/ipvs/Kconfig                   |  239 ---
 net/ipv4/ipvs/Makefile                  |   33 -
 net/ipv4/ipvs/ip_vs_app.c               |  622 ------
 net/ipv4/ipvs/ip_vs_conn.c              | 1110 ----------
 net/ipv4/ipvs/ip_vs_core.c              | 1542 --------------
 net/ipv4/ipvs/ip_vs_ctl.c               | 3443 -------------------------------
 net/ipv4/ipvs/ip_vs_est.c               |  166 --
 net/ipv4/ipvs/ip_vs_ftp.c               |  410 ----
 net/ipv4/ipvs/ip_vs_lblc.c              |  555 -----
 net/ipv4/ipvs/ip_vs_lblcr.c             |  755 -------
 net/ipv4/ipvs/ip_vs_lc.c                |  103 -
 net/ipv4/ipvs/ip_vs_nq.c                |  138 --
 net/ipv4/ipvs/ip_vs_proto.c             |  288 ---
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c      |  235 ---
 net/ipv4/ipvs/ip_vs_proto_tcp.c         |  732 -------
 net/ipv4/ipvs/ip_vs_proto_udp.c         |  533 -----
 net/ipv4/ipvs/ip_vs_rr.c                |  112 -
 net/ipv4/ipvs/ip_vs_sched.c             |  251 ---
 net/ipv4/ipvs/ip_vs_sed.c               |  140 --
 net/ipv4/ipvs/ip_vs_sh.c                |  258 ---
 net/ipv4/ipvs/ip_vs_sync.c              |  942 ---------
 net/ipv4/ipvs/ip_vs_wlc.c               |  128 --
 net/ipv4/ipvs/ip_vs_wrr.c               |  237 ---
 net/ipv4/ipvs/ip_vs_xmit.c              | 1004 ---------
 net/netfilter/Kconfig                   |    2 +
 net/netfilter/Makefile                  |    3 +
 net/netfilter/ipvs/Kconfig              |  239 +++
 net/netfilter/ipvs/Makefile             |   33 +
 net/netfilter/ipvs/ip_vs_app.c          |  622 ++++++
 net/netfilter/ipvs/ip_vs_conn.c         | 1110 ++++++++++
 net/netfilter/ipvs/ip_vs_core.c         | 1542 ++++++++++++++
 net/netfilter/ipvs/ip_vs_ctl.c          | 3443 +++++++++++++++++++++++++++++++
 net/netfilter/ipvs/ip_vs_dh.c           |  261 +++
 net/netfilter/ipvs/ip_vs_est.c          |  166 ++
 net/netfilter/ipvs/ip_vs_ftp.c          |  410 ++++
 net/netfilter/ipvs/ip_vs_lblc.c         |  555 +++++
 net/netfilter/ipvs/ip_vs_lblcr.c        |  755 +++++++
 net/netfilter/ipvs/ip_vs_lc.c           |  103 +
 net/netfilter/ipvs/ip_vs_nq.c           |  138 ++
 net/netfilter/ipvs/ip_vs_proto.c        |  288 +++
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |  235 +++
 net/netfilter/ipvs/ip_vs_proto_tcp.c    |  732 +++++++
 net/netfilter/ipvs/ip_vs_proto_udp.c    |  533 +++++
 net/netfilter/ipvs/ip_vs_rr.c           |  112 +
 net/netfilter/ipvs/ip_vs_sched.c        |  251 +++
 net/netfilter/ipvs/ip_vs_sed.c          |  140 ++
 net/netfilter/ipvs/ip_vs_sh.c           |  258 +++
 net/netfilter/ipvs/ip_vs_sync.c         |  942 +++++++++
 net/netfilter/ipvs/ip_vs_wlc.c          |  128 ++
 net/netfilter/ipvs/ip_vs_wrr.c          |  237 +++
 net/netfilter/ipvs/ip_vs_xmit.c         | 1004 +++++++++
 53 files changed, 14242 insertions(+), 13979 deletions(-)
 delete mode 100644 net/ipv4/ipvs/Kconfig
 delete mode 100644 net/ipv4/ipvs/Makefile
 delete mode 100644 net/ipv4/ipvs/ip_vs_app.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_conn.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_core.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_ctl.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_est.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_ftp.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_lblc.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_lblcr.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_lc.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_nq.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto_ah_esp.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto_tcp.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto_udp.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_rr.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_sched.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_sed.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_sh.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_sync.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_wlc.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_wrr.c
 delete mode 100644 net/ipv4/ipvs/ip_vs_xmit.c
 create mode 100644 net/netfilter/ipvs/Kconfig
 create mode 100644 net/netfilter/ipvs/Makefile
 create mode 100644 net/netfilter/ipvs/ip_vs_app.c
 create mode 100644 net/netfilter/ipvs/ip_vs_conn.c
 create mode 100644 net/netfilter/ipvs/ip_vs_core.c
 create mode 100644 net/netfilter/ipvs/ip_vs_ctl.c
 create mode 100644 net/netfilter/ipvs/ip_vs_dh.c
 create mode 100644 net/netfilter/ipvs/ip_vs_est.c
 create mode 100644 net/netfilter/ipvs/ip_vs_ftp.c
 create mode 100644 net/netfilter/ipvs/ip_vs_lblc.c
 create mode 100644 net/netfilter/ipvs/ip_vs_lblcr.c
 create mode 100644 net/netfilter/ipvs/ip_vs_lc.c
 create mode 100644 net/netfilter/ipvs/ip_vs_nq.c
 create mode 100644 net/netfilter/ipvs/ip_vs_proto.c
 create mode 100644 net/netfilter/ipvs/ip_vs_proto_ah_esp.c
 create mode 100644 net/netfilter/ipvs/ip_vs_proto_tcp.c
 create mode 100644 net/netfilter/ipvs/ip_vs_proto_udp.c
 create mode 100644 net/netfilter/ipvs/ip_vs_rr.c
 create mode 100644 net/netfilter/ipvs/ip_vs_sched.c
 create mode 100644 net/netfilter/ipvs/ip_vs_sed.c
 create mode 100644 net/netfilter/ipvs/ip_vs_sh.c
 create mode 100644 net/netfilter/ipvs/ip_vs_sync.c
 create mode 100644 net/netfilter/ipvs/ip_vs_wlc.c
 create mode 100644 net/netfilter/ipvs/ip_vs_wrr.c
 create mode 100644 net/netfilter/ipvs/ip_vs_xmit.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 591ea23639c..691268f3a35 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -630,5 +630,3 @@ config TCP_MD5SIG
 
 	  If unsure, say N.
 
-source "net/ipv4/ipvs/Kconfig"
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ad40ef3f9eb..80ff87ce43a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
deleted file mode 100644
index de6004de80b..00000000000
--- a/net/ipv4/ipvs/Kconfig
+++ /dev/null
@@ -1,239 +0,0 @@
-#
-# IP Virtual Server configuration
-#
-menuconfig IP_VS
-	tristate "IP virtual server support (EXPERIMENTAL)"
-	depends on NETFILTER
-	---help---
-	  IP Virtual Server support will let you build a high-performance
-	  virtual server based on cluster of two or more real servers. This
-	  option must be enabled for at least one of the clustered computers
-	  that will take care of intercepting incoming connections to a
-	  single IP address and scheduling them to real servers.
-
-	  Three request dispatching techniques are implemented, they are
-	  virtual server via NAT, virtual server via tunneling and virtual
-	  server via direct routing. The several scheduling algorithms can
-	  be used to choose which server the connection is directed to,
-	  thus load balancing can be achieved among the servers.  For more
-	  information and its administration program, please visit the
-	  following URL: <http://www.linuxvirtualserver.org/>.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-if IP_VS
-
-config	IP_VS_IPV6
-	bool "IPv6 support for IPVS (DANGEROUS)"
-	depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
-	---help---
-	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
-
-	  Say N if unsure.
-
-config	IP_VS_DEBUG
-	bool "IP virtual server debugging"
-	---help---
-	  Say Y here if you want to get additional messages useful in
-	  debugging the IP virtual server code. You can change the debug
-	  level in /proc/sys/net/ipv4/vs/debug_level
-
-config	IP_VS_TAB_BITS
-	int "IPVS connection table size (the Nth power of 2)"
-	range 8 20
-	default 12
-	---help---
-	  The IPVS connection hash table uses the chaining scheme to handle
-	  hash collisions. Using a big IPVS connection hash table will greatly
-	  reduce conflicts when there are hundreds of thousands of connections
-	  in the hash table.
-
-	  Note the table size must be power of 2. The table size will be the
-	  value of 2 to the your input number power. The number to choose is
-	  from 8 to 20, the default number is 12, which means the table size
-	  is 4096. Don't input the number too small, otherwise you will lose
-	  performance on it. You can adapt the table size yourself, according
-	  to your virtual server application. It is good to set the table size
-	  not far less than the number of connections per second multiplying
-	  average lasting time of connection in the table.  For example, your
-	  virtual server gets 200 connections per second, the connection lasts
-	  for 200 seconds in average in the connection table, the table size
-	  should be not far less than 200x200, it is good to set the table
-	  size 32768 (2**15).
-
-	  Another note that each connection occupies 128 bytes effectively and
-	  each hash entry uses 8 bytes, so you can estimate how much memory is
-	  needed for your box.
-
-comment "IPVS transport protocol load balancing support"
-
-config	IP_VS_PROTO_TCP
-	bool "TCP load balancing support"
-	---help---
-	  This option enables support for load balancing TCP transport
-	  protocol. Say Y if unsure.
-
-config	IP_VS_PROTO_UDP
-	bool "UDP load balancing support"
-	---help---
-	  This option enables support for load balancing UDP transport
-	  protocol. Say Y if unsure.
-
-config	IP_VS_PROTO_AH_ESP
-	bool
-	depends on UNDEFINED
-
-config	IP_VS_PROTO_ESP
-	bool "ESP load balancing support"
-	select IP_VS_PROTO_AH_ESP
-	---help---
-	  This option enables support for load balancing ESP (Encapsulation
-	  Security Payload) transport protocol. Say Y if unsure.
-
-config	IP_VS_PROTO_AH
-	bool "AH load balancing support"
-	select IP_VS_PROTO_AH_ESP
-	---help---
-	  This option enables support for load balancing AH (Authentication
-	  Header) transport protocol. Say Y if unsure.
-
-comment "IPVS scheduler"
-
-config	IP_VS_RR
-	tristate "round-robin scheduling"
-	---help---
-	  The robin-robin scheduling algorithm simply directs network
-	  connections to different real servers in a round-robin manner.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
- 
-config	IP_VS_WRR
-        tristate "weighted round-robin scheduling" 
-	---help---
-	  The weighted robin-robin scheduling algorithm directs network
-	  connections to different real servers based on server weights
-	  in a round-robin manner. Servers with higher weights receive
-	  new connections first than those with less weights, and servers
-	  with higher weights get more connections than those with less
-	  weights and servers with equal weights get equal connections.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_LC
-        tristate "least-connection scheduling"
-	---help---
-	  The least-connection scheduling algorithm directs network
-	  connections to the server with the least number of active 
-	  connections.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_WLC
-        tristate "weighted least-connection scheduling"
-	---help---
-	  The weighted least-connection scheduling algorithm directs network
-	  connections to the server with the least active connections
-	  normalized by the server weight.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_LBLC
-	tristate "locality-based least-connection scheduling"
-	---help---
-	  The locality-based least-connection scheduling algorithm is for
-	  destination IP load balancing. It is usually used in cache cluster.
-	  This algorithm usually directs packet destined for an IP address to
-	  its server if the server is alive and under load. If the server is
-	  overloaded (its active connection numbers is larger than its weight)
-	  and there is a server in its half load, then allocate the weighted
-	  least-connection server to this IP address.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config  IP_VS_LBLCR
-	tristate "locality-based least-connection with replication scheduling"
-	---help---
-	  The locality-based least-connection with replication scheduling
-	  algorithm is also for destination IP load balancing. It is 
-	  usually used in cache cluster. It differs from the LBLC scheduling
-	  as follows: the load balancer maintains mappings from a target
-	  to a set of server nodes that can serve the target. Requests for
-	  a target are assigned to the least-connection node in the target's
-	  server set. If all the node in the server set are over loaded,
-	  it picks up a least-connection node in the cluster and adds it
-	  in the sever set for the target. If the server set has not been
-	  modified for the specified time, the most loaded node is removed
-	  from the server set, in order to avoid high degree of replication.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_DH
-	tristate "destination hashing scheduling"
-	---help---
-	  The destination hashing scheduling algorithm assigns network
-	  connections to the servers through looking up a statically assigned
-	  hash table by their destination IP addresses.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_SH
-	tristate "source hashing scheduling"
-	---help---
-	  The source hashing scheduling algorithm assigns network
-	  connections to the servers through looking up a statically assigned
-	  hash table by their source IP addresses.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_SED
-	tristate "shortest expected delay scheduling"
-	---help---
-	  The shortest expected delay scheduling algorithm assigns network
-	  connections to the server with the shortest expected delay. The 
-	  expected delay that the job will experience is (Ci + 1) / Ui if 
-	  sent to the ith server, in which Ci is the number of connections
-	  on the ith server and Ui is the fixed service rate (weight)
-	  of the ith server.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-config	IP_VS_NQ
-	tristate "never queue scheduling"
-	---help---
-	  The never queue scheduling algorithm adopts a two-speed model.
-	  When there is an idle server available, the job will be sent to
-	  the idle server, instead of waiting for a fast one. When there
-	  is no idle server available, the job will be sent to the server
-	  that minimize its expected delay (The Shortest Expected Delay
-	  scheduling algorithm).
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-comment 'IPVS application helper'
-
-config	IP_VS_FTP
-  	tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP
-	---help---
-	  FTP is a protocol that transfers IP address and/or port number in
-	  the payload. In the virtual server via Network Address Translation,
-	  the IP address and port number of real servers cannot be sent to
-	  clients in ftp connections directly, so FTP protocol helper is
-	  required for tracking the connection and mangling it back to that of
-	  virtual service.
-
-	  If you want to compile it in kernel, say Y. To compile it as a
-	  module, choose M here. If unsure, say N.
-
-endif # IP_VS
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
deleted file mode 100644
index 73a46fe1fe4..00000000000
--- a/net/ipv4/ipvs/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Makefile for the IPVS modules on top of IPv4.
-#
-
-# IPVS transport protocol load balancing support
-ip_vs_proto-objs-y :=
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
-
-ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
-		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
-		ip_vs_est.o ip_vs_proto.o 				   \
-		$(ip_vs_proto-objs-y)
-
-
-# IPVS core
-obj-$(CONFIG_IP_VS) += ip_vs.o
-
-# IPVS schedulers
-obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
-obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
-obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
-obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
-obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
-obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
-obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
-obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
-obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
-obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
-
-# IPVS application helpers
-obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
deleted file mode 100644
index 201b8ea3020..00000000000
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ /dev/null
@@ -1,622 +0,0 @@
-/*
- * ip_vs_app.c: Application module support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
- * is that ip_vs_app module handles the reverse direction (incoming requests
- * and outgoing responses).
- *
- *		IP_MASQ_APP application masquerading module
- *
- * Author:	Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/net_namespace.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/mutex.h>
-
-#include <net/ip_vs.h>
-
-EXPORT_SYMBOL(register_ip_vs_app);
-EXPORT_SYMBOL(unregister_ip_vs_app);
-EXPORT_SYMBOL(register_ip_vs_app_inc);
-
-/* ipvs application list head */
-static LIST_HEAD(ip_vs_app_list);
-static DEFINE_MUTEX(__ip_vs_app_mutex);
-
-
-/*
- *	Get an ip_vs_app object
- */
-static inline int ip_vs_app_get(struct ip_vs_app *app)
-{
-	return try_module_get(app->module);
-}
-
-
-static inline void ip_vs_app_put(struct ip_vs_app *app)
-{
-	module_put(app->module);
-}
-
-
-/*
- *	Allocate/initialize app incarnation and register it in proto apps.
- */
-static int
-ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
-{
-	struct ip_vs_protocol *pp;
-	struct ip_vs_app *inc;
-	int ret;
-
-	if (!(pp = ip_vs_proto_get(proto)))
-		return -EPROTONOSUPPORT;
-
-	if (!pp->unregister_app)
-		return -EOPNOTSUPP;
-
-	inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
-	if (!inc)
-		return -ENOMEM;
-	INIT_LIST_HEAD(&inc->p_list);
-	INIT_LIST_HEAD(&inc->incs_list);
-	inc->app = app;
-	inc->port = htons(port);
-	atomic_set(&inc->usecnt, 0);
-
-	if (app->timeouts) {
-		inc->timeout_table =
-			ip_vs_create_timeout_table(app->timeouts,
-						   app->timeouts_size);
-		if (!inc->timeout_table) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	ret = pp->register_app(inc);
-	if (ret)
-		goto out;
-
-	list_add(&inc->a_list, &app->incs_list);
-	IP_VS_DBG(9, "%s application %s:%u registered\n",
-		  pp->name, inc->name, inc->port);
-
-	return 0;
-
-  out:
-	kfree(inc->timeout_table);
-	kfree(inc);
-	return ret;
-}
-
-
-/*
- *	Release app incarnation
- */
-static void
-ip_vs_app_inc_release(struct ip_vs_app *inc)
-{
-	struct ip_vs_protocol *pp;
-
-	if (!(pp = ip_vs_proto_get(inc->protocol)))
-		return;
-
-	if (pp->unregister_app)
-		pp->unregister_app(inc);
-
-	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-		  pp->name, inc->name, inc->port);
-
-	list_del(&inc->a_list);
-
-	kfree(inc->timeout_table);
-	kfree(inc);
-}
-
-
-/*
- *	Get reference to app inc (only called from softirq)
- *
- */
-int ip_vs_app_inc_get(struct ip_vs_app *inc)
-{
-	int result;
-
-	atomic_inc(&inc->usecnt);
-	if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
-		atomic_dec(&inc->usecnt);
-	return result;
-}
-
-
-/*
- *	Put the app inc (only called from timer or net softirq)
- */
-void ip_vs_app_inc_put(struct ip_vs_app *inc)
-{
-	ip_vs_app_put(inc->app);
-	atomic_dec(&inc->usecnt);
-}
-
-
-/*
- *	Register an application incarnation in protocol applications
- */
-int
-register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
-{
-	int result;
-
-	mutex_lock(&__ip_vs_app_mutex);
-
-	result = ip_vs_app_inc_new(app, proto, port);
-
-	mutex_unlock(&__ip_vs_app_mutex);
-
-	return result;
-}
-
-
-/*
- *	ip_vs_app registration routine
- */
-int register_ip_vs_app(struct ip_vs_app *app)
-{
-	/* increase the module use count */
-	ip_vs_use_count_inc();
-
-	mutex_lock(&__ip_vs_app_mutex);
-
-	list_add(&app->a_list, &ip_vs_app_list);
-
-	mutex_unlock(&__ip_vs_app_mutex);
-
-	return 0;
-}
-
-
-/*
- *	ip_vs_app unregistration routine
- *	We are sure there are no app incarnations attached to services
- */
-void unregister_ip_vs_app(struct ip_vs_app *app)
-{
-	struct ip_vs_app *inc, *nxt;
-
-	mutex_lock(&__ip_vs_app_mutex);
-
-	list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
-		ip_vs_app_inc_release(inc);
-	}
-
-	list_del(&app->a_list);
-
-	mutex_unlock(&__ip_vs_app_mutex);
-
-	/* decrease the module use count */
-	ip_vs_use_count_dec();
-}
-
-
-/*
- *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
- */
-int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
-{
-	return pp->app_conn_bind(cp);
-}
-
-
-/*
- *	Unbind cp from application incarnation (called by cp destructor)
- */
-void ip_vs_unbind_app(struct ip_vs_conn *cp)
-{
-	struct ip_vs_app *inc = cp->app;
-
-	if (!inc)
-		return;
-
-	if (inc->unbind_conn)
-		inc->unbind_conn(inc, cp);
-	if (inc->done_conn)
-		inc->done_conn(inc, cp);
-	ip_vs_app_inc_put(inc);
-	cp->app = NULL;
-}
-
-
-/*
- *	Fixes th->seq based on ip_vs_seq info.
- */
-static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
-{
-	__u32 seq = ntohl(th->seq);
-
-	/*
-	 *	Adjust seq with delta-offset for all packets after
-	 *	the most recent resized pkt seq and with previous_delta offset
-	 *	for all packets	before most recent resized pkt seq.
-	 */
-	if (vseq->delta || vseq->previous_delta) {
-		if(after(seq, vseq->init_seq)) {
-			th->seq = htonl(seq + vseq->delta);
-			IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
-				  vseq->delta);
-		} else {
-			th->seq = htonl(seq + vseq->previous_delta);
-			IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
-				  "(%d) to seq\n", vseq->previous_delta);
-		}
-	}
-}
-
-
-/*
- *	Fixes th->ack_seq based on ip_vs_seq info.
- */
-static inline void
-vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
-{
-	__u32 ack_seq = ntohl(th->ack_seq);
-
-	/*
-	 * Adjust ack_seq with delta-offset for
-	 * the packets AFTER most recent resized pkt has caused a shift
-	 * for packets before most recent resized pkt, use previous_delta
-	 */
-	if (vseq->delta || vseq->previous_delta) {
-		/* since ack_seq is the number of octet that is expected
-		   to receive next, so compare it with init_seq+delta */
-		if(after(ack_seq, vseq->init_seq+vseq->delta)) {
-			th->ack_seq = htonl(ack_seq - vseq->delta);
-			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
-				  "(%d) from ack_seq\n", vseq->delta);
-
-		} else {
-			th->ack_seq = htonl(ack_seq - vseq->previous_delta);
-			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
-				  "previous_delta (%d) from ack_seq\n",
-				  vseq->previous_delta);
-		}
-	}
-}
-
-
-/*
- *	Updates ip_vs_seq if pkt has been resized
- *	Assumes already checked proto==IPPROTO_TCP and diff!=0.
- */
-static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
-				 unsigned flag, __u32 seq, int diff)
-{
-	/* spinlock is to keep updating cp->flags atomic */
-	spin_lock(&cp->lock);
-	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
-		vseq->previous_delta = vseq->delta;
-		vseq->delta += diff;
-		vseq->init_seq = seq;
-		cp->flags |= flag;
-	}
-	spin_unlock(&cp->lock);
-}
-
-static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
-				  struct ip_vs_app *app)
-{
-	int diff;
-	const unsigned int tcp_offset = ip_hdrlen(skb);
-	struct tcphdr *th;
-	__u32 seq;
-
-	if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
-		return 0;
-
-	th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
-
-	/*
-	 *	Remember seq number in case this pkt gets resized
-	 */
-	seq = ntohl(th->seq);
-
-	/*
-	 *	Fix seq stuff if flagged as so.
-	 */
-	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
-		vs_fix_seq(&cp->out_seq, th);
-	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
-		vs_fix_ack_seq(&cp->in_seq, th);
-
-	/*
-	 *	Call private output hook function
-	 */
-	if (app->pkt_out == NULL)
-		return 1;
-
-	if (!app->pkt_out(app, cp, skb, &diff))
-		return 0;
-
-	/*
-	 *	Update ip_vs seq stuff if len has changed.
-	 */
-	if (diff != 0)
-		vs_seq_update(cp, &cp->out_seq,
-			      IP_VS_CONN_F_OUT_SEQ, seq, diff);
-
-	return 1;
-}
-
-/*
- *	Output pkt hook. Will call bound ip_vs_app specific function
- *	called by ipvs packet handler, assumes previously checked cp!=NULL
- *	returns false if it can't handle packet (oom)
- */
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-	struct ip_vs_app *app;
-
-	/*
-	 *	check if application module is bound to
-	 *	this ip_vs_conn.
-	 */
-	if ((app = cp->app) == NULL)
-		return 1;
-
-	/* TCP is complicated */
-	if (cp->protocol == IPPROTO_TCP)
-		return app_tcp_pkt_out(cp, skb, app);
-
-	/*
-	 *	Call private output hook function
-	 */
-	if (app->pkt_out == NULL)
-		return 1;
-
-	return app->pkt_out(app, cp, skb, NULL);
-}
-
-
-static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
-				 struct ip_vs_app *app)
-{
-	int diff;
-	const unsigned int tcp_offset = ip_hdrlen(skb);
-	struct tcphdr *th;
-	__u32 seq;
-
-	if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
-		return 0;
-
-	th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
-
-	/*
-	 *	Remember seq number in case this pkt gets resized
-	 */
-	seq = ntohl(th->seq);
-
-	/*
-	 *	Fix seq stuff if flagged as so.
-	 */
-	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
-		vs_fix_seq(&cp->in_seq, th);
-	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
-		vs_fix_ack_seq(&cp->out_seq, th);
-
-	/*
-	 *	Call private input hook function
-	 */
-	if (app->pkt_in == NULL)
-		return 1;
-
-	if (!app->pkt_in(app, cp, skb, &diff))
-		return 0;
-
-	/*
-	 *	Update ip_vs seq stuff if len has changed.
-	 */
-	if (diff != 0)
-		vs_seq_update(cp, &cp->in_seq,
-			      IP_VS_CONN_F_IN_SEQ, seq, diff);
-
-	return 1;
-}
-
-/*
- *	Input pkt hook. Will call bound ip_vs_app specific function
- *	called by ipvs packet handler, assumes previously checked cp!=NULL.
- *	returns false if can't handle packet (oom).
- */
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-	struct ip_vs_app *app;
-
-	/*
-	 *	check if application module is bound to
-	 *	this ip_vs_conn.
-	 */
-	if ((app = cp->app) == NULL)
-		return 1;
-
-	/* TCP is complicated */
-	if (cp->protocol == IPPROTO_TCP)
-		return app_tcp_pkt_in(cp, skb, app);
-
-	/*
-	 *	Call private input hook function
-	 */
-	if (app->pkt_in == NULL)
-		return 1;
-
-	return app->pkt_in(app, cp, skb, NULL);
-}
-
-
-#ifdef CONFIG_PROC_FS
-/*
- *	/proc/net/ip_vs_app entry function
- */
-
-static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
-{
-	struct ip_vs_app *app, *inc;
-
-	list_for_each_entry(app, &ip_vs_app_list, a_list) {
-		list_for_each_entry(inc, &app->incs_list, a_list) {
-			if (pos-- == 0)
-				return inc;
-		}
-	}
-	return NULL;
-
-}
-
-static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
-{
-	mutex_lock(&__ip_vs_app_mutex);
-
-	return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
-}
-
-static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct ip_vs_app *inc, *app;
-	struct list_head *e;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ip_vs_app_idx(0);
-
-	inc = v;
-	app = inc->app;
-
-	if ((e = inc->a_list.next) != &app->incs_list)
-		return list_entry(e, struct ip_vs_app, a_list);
-
-	/* go on to next application */
-	for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
-		app = list_entry(e, struct ip_vs_app, a_list);
-		list_for_each_entry(inc, &app->incs_list, a_list) {
-			return inc;
-		}
-	}
-	return NULL;
-}
-
-static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
-{
-	mutex_unlock(&__ip_vs_app_mutex);
-}
-
-static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
-{
-	if (v == SEQ_START_TOKEN)
-		seq_puts(seq, "prot port    usecnt name\n");
-	else {
-		const struct ip_vs_app *inc = v;
-
-		seq_printf(seq, "%-3s  %-7u %-6d %-17s\n",
-			   ip_vs_proto_name(inc->protocol),
-			   ntohs(inc->port),
-			   atomic_read(&inc->usecnt),
-			   inc->name);
-	}
-	return 0;
-}
-
-static const struct seq_operations ip_vs_app_seq_ops = {
-	.start = ip_vs_app_seq_start,
-	.next  = ip_vs_app_seq_next,
-	.stop  = ip_vs_app_seq_stop,
-	.show  = ip_vs_app_seq_show,
-};
-
-static int ip_vs_app_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ip_vs_app_seq_ops);
-}
-
-static const struct file_operations ip_vs_app_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = ip_vs_app_open,
-	.read	 = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-#endif
-
-
-/*
- *	Replace a segment of data with a new segment
- */
-int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-		      char *o_buf, int o_len, char *n_buf, int n_len)
-{
-	int diff;
-	int o_offset;
-	int o_left;
-
-	EnterFunction(9);
-
-	diff = n_len - o_len;
-	o_offset = o_buf - (char *)skb->data;
-	/* The length of left data after o_buf+o_len in the skb data */
-	o_left = skb->len - (o_offset + o_len);
-
-	if (diff <= 0) {
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-		skb_trim(skb, skb->len + diff);
-	} else if (diff <= skb_tailroom(skb)) {
-		skb_put(skb, diff);
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-	} else {
-		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
-			return -ENOMEM;
-		skb_put(skb, diff);
-		memmove(skb->data + o_offset + n_len,
-			skb->data + o_offset + o_len, o_left);
-		skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
-	}
-
-	/* must update the iph total length here */
-	ip_hdr(skb)->tot_len = htons(skb->len);
-
-	LeaveFunction(9);
-	return 0;
-}
-
-
-int __init ip_vs_app_init(void)
-{
-	/* we will replace it with proc_net_ipvs_create() soon */
-	proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
-	return 0;
-}
-
-
-void ip_vs_app_cleanup(void)
-{
-	proc_net_remove(&init_net, "ip_vs_app");
-}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
deleted file mode 100644
index 9a24332fbed..00000000000
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ /dev/null
@@ -1,1110 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the Netfilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
- * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
- * and others. Many code here is taken from IP MASQ code of kernel 2.2.
- *
- * Changes:
- *
- */
-
-#include <linux/interrupt.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/proc_fs.h>		/* for proc_net_* */
-#include <linux/seq_file.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
-
-#include <net/net_namespace.h>
-#include <net/ip_vs.h>
-
-
-/*
- *  Connection hash table: for input and output packets lookups of IPVS
- */
-static struct list_head *ip_vs_conn_tab;
-
-/*  SLAB cache for IPVS connections */
-static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-
-/*  counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
-/*  counter for no client port connections */
-static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
-
-/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
-
-/*
- *  Fine locking granularity for big connection hash table
- */
-#define CT_LOCKARRAY_BITS  4
-#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
-#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
-
-struct ip_vs_aligned_lock
-{
-	rwlock_t	l;
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
-/* lock array for conn table */
-static struct ip_vs_aligned_lock
-__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-
-static inline void ct_read_lock(unsigned key)
-{
-	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned key)
-{
-	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock(unsigned key)
-{
-	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock(unsigned key)
-{
-	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned key)
-{
-	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned key)
-{
-	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned key)
-{
-	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned key)
-{
-	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-
-/*
- *	Returns hash value for IPVS connection entry
- */
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
-				       const union nf_inet_addr *addr,
-				       __be16 port)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
-				    (__force u32)port, proto, ip_vs_conn_rnd)
-			& IP_VS_CONN_TAB_MASK;
-#endif
-	return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
-			    ip_vs_conn_rnd)
-		& IP_VS_CONN_TAB_MASK;
-}
-
-
-/*
- *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
- *	returns bool success.
- */
-static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
-{
-	unsigned hash;
-	int ret;
-
-	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
-
-	ct_write_lock(hash);
-
-	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
-		cp->flags |= IP_VS_CONN_F_HASHED;
-		atomic_inc(&cp->refcnt);
-		ret = 1;
-	} else {
-		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
-			  "called from %p\n", __builtin_return_address(0));
-		ret = 0;
-	}
-
-	ct_write_unlock(hash);
-
-	return ret;
-}
-
-
-/*
- *	UNhashes ip_vs_conn from ip_vs_conn_tab.
- *	returns bool success.
- */
-static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
-{
-	unsigned hash;
-	int ret;
-
-	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
-
-	ct_write_lock(hash);
-
-	if (cp->flags & IP_VS_CONN_F_HASHED) {
-		list_del(&cp->c_list);
-		cp->flags &= ~IP_VS_CONN_F_HASHED;
-		atomic_dec(&cp->refcnt);
-		ret = 1;
-	} else
-		ret = 0;
-
-	ct_write_unlock(hash);
-
-	return ret;
-}
-
-
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from OUTside-to-INside.
- *	s_addr, s_port: pkt source address (foreign host)
- *	d_addr, d_port: pkt dest address (load balancer)
- */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-	unsigned hash;
-	struct ip_vs_conn *cp;
-
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
-
-	ct_read_lock(hash);
-
-	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
-		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-		    protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
-			ct_read_unlock(hash);
-			return cp;
-		}
-	}
-
-	ct_read_unlock(hash);
-
-	return NULL;
-}
-
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-	struct ip_vs_conn *cp;
-
-	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-					 d_port);
-
-	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
-		      cp ? "hit" : "not hit");
-
-	return cp;
-}
-
-/* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-	unsigned hash;
-	struct ip_vs_conn *cp;
-
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
-
-	ct_read_lock(hash);
-
-	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
-		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
-			goto out;
-		}
-	}
-	cp = NULL;
-
-  out:
-	ct_read_unlock(hash);
-
-	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
-		      cp ? "hit" : "not hit");
-
-	return cp;
-}
-
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from inside-to-OUTside.
- *	s_addr, s_port: pkt source address (inside host)
- *	d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-	unsigned hash;
-	struct ip_vs_conn *cp, *ret=NULL;
-
-	/*
-	 *	Check for "full" addressed entries
-	 */
-	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
-
-	ct_read_lock(hash);
-
-	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-		    d_port == cp->cport && s_port == cp->dport &&
-		    protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
-			ret = cp;
-			break;
-		}
-	}
-
-	ct_read_unlock(hash);
-
-	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
-		      ret ? "hit" : "not hit");
-
-	return ret;
-}
-
-
-/*
- *      Put back the conn and restart its timer with its timeout
- */
-void ip_vs_conn_put(struct ip_vs_conn *cp)
-{
-	/* reset it expire in its timeout */
-	mod_timer(&cp->timer, jiffies+cp->timeout);
-
-	__ip_vs_conn_put(cp);
-}
-
-
-/*
- *	Fill a no_client_port connection with a client port number
- */
-void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
-{
-	if (ip_vs_conn_unhash(cp)) {
-		spin_lock(&cp->lock);
-		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
-			atomic_dec(&ip_vs_conn_no_cport_cnt);
-			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
-			cp->cport = cport;
-		}
-		spin_unlock(&cp->lock);
-
-		/* hash on new dport */
-		ip_vs_conn_hash(cp);
-	}
-}
-
-
-/*
- *	Bind a connection entry with the corresponding packet_xmit.
- *	Called by ip_vs_conn_new.
- */
-static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
-{
-	switch (IP_VS_FWD_METHOD(cp)) {
-	case IP_VS_CONN_F_MASQ:
-		cp->packet_xmit = ip_vs_nat_xmit;
-		break;
-
-	case IP_VS_CONN_F_TUNNEL:
-		cp->packet_xmit = ip_vs_tunnel_xmit;
-		break;
-
-	case IP_VS_CONN_F_DROUTE:
-		cp->packet_xmit = ip_vs_dr_xmit;
-		break;
-
-	case IP_VS_CONN_F_LOCALNODE:
-		cp->packet_xmit = ip_vs_null_xmit;
-		break;
-
-	case IP_VS_CONN_F_BYPASS:
-		cp->packet_xmit = ip_vs_bypass_xmit;
-		break;
-	}
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
-{
-	switch (IP_VS_FWD_METHOD(cp)) {
-	case IP_VS_CONN_F_MASQ:
-		cp->packet_xmit = ip_vs_nat_xmit_v6;
-		break;
-
-	case IP_VS_CONN_F_TUNNEL:
-		cp->packet_xmit = ip_vs_tunnel_xmit_v6;
-		break;
-
-	case IP_VS_CONN_F_DROUTE:
-		cp->packet_xmit = ip_vs_dr_xmit_v6;
-		break;
-
-	case IP_VS_CONN_F_LOCALNODE:
-		cp->packet_xmit = ip_vs_null_xmit;
-		break;
-
-	case IP_VS_CONN_F_BYPASS:
-		cp->packet_xmit = ip_vs_bypass_xmit_v6;
-		break;
-	}
-}
-#endif
-
-
-static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
-{
-	return atomic_read(&dest->activeconns)
-		+ atomic_read(&dest->inactconns);
-}
-
-/*
- *	Bind a connection entry with a virtual service destination
- *	Called just after a new connection entry is created.
- */
-static inline void
-ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
-{
-	/* if dest is NULL, then return directly */
-	if (!dest)
-		return;
-
-	/* Increase the refcnt counter of the dest */
-	atomic_inc(&dest->refcnt);
-
-	/* Bind with the destination and its corresponding transmitter */
-	if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-	    (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
-		/* if the connection is not template and is created
-		 * by sync, preserve the activity flag.
-		 */
-		cp->flags |= atomic_read(&dest->conn_flags) &
-			     (~IP_VS_CONN_F_INACTIVE);
-	else
-		cp->flags |= atomic_read(&dest->conn_flags);
-	cp->dest = dest;
-
-	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
-		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
-		      "dest->refcnt:%d\n",
-		      ip_vs_proto_name(cp->protocol),
-		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
-		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
-		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
-		      ip_vs_fwd_tag(cp), cp->state,
-		      cp->flags, atomic_read(&cp->refcnt),
-		      atomic_read(&dest->refcnt));
-
-	/* Update the connection counters */
-	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
-		/* It is a normal connection, so increase the inactive
-		   connection counter because it is in TCP SYNRECV
-		   state (inactive) or other protocol inacive state */
-		if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-		    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
-			atomic_inc(&dest->activeconns);
-		else
-			atomic_inc(&dest->inactconns);
-	} else {
-		/* It is a persistent connection/template, so increase
-		   the peristent connection counter */
-		atomic_inc(&dest->persistconns);
-	}
-
-	if (dest->u_threshold != 0 &&
-	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
-		dest->flags |= IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- * Check if there is a destination for the connection, if so
- * bind the connection to the destination.
- */
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
-{
-	struct ip_vs_dest *dest;
-
-	if ((cp) && (!cp->dest)) {
-		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
-				       &cp->vaddr, cp->vport,
-				       cp->protocol);
-		ip_vs_bind_dest(cp, dest);
-		return dest;
-	} else
-		return NULL;
-}
-
-
-/*
- *	Unbind a connection entry with its VS destination
- *	Called by the ip_vs_conn_expire function.
- */
-static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
-{
-	struct ip_vs_dest *dest = cp->dest;
-
-	if (!dest)
-		return;
-
-	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
-		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
-		      "dest->refcnt:%d\n",
-		      ip_vs_proto_name(cp->protocol),
-		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
-		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
-		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
-		      ip_vs_fwd_tag(cp), cp->state,
-		      cp->flags, atomic_read(&cp->refcnt),
-		      atomic_read(&dest->refcnt));
-
-	/* Update the connection counters */
-	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
-		/* It is a normal connection, so decrease the inactconns
-		   or activeconns counter */
-		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
-			atomic_dec(&dest->inactconns);
-		} else {
-			atomic_dec(&dest->activeconns);
-		}
-	} else {
-		/* It is a persistent connection/template, so decrease
-		   the peristent connection counter */
-		atomic_dec(&dest->persistconns);
-	}
-
-	if (dest->l_threshold != 0) {
-		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
-			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-	} else if (dest->u_threshold != 0) {
-		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
-			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-	} else {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-	}
-
-	/*
-	 * Simply decrease the refcnt of the dest, because the
-	 * dest will be either in service's destination list
-	 * or in the trash.
-	 */
-	atomic_dec(&dest->refcnt);
-}
-
-
-/*
- *	Checking if the destination of a connection template is available.
- *	If available, return 1, otherwise invalidate this connection
- *	template and return 0.
- */
-int ip_vs_check_template(struct ip_vs_conn *ct)
-{
-	struct ip_vs_dest *dest = ct->dest;
-
-	/*
-	 * Checking the dest server status.
-	 */
-	if ((dest == NULL) ||
-	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
-	    (sysctl_ip_vs_expire_quiescent_template &&
-	     (atomic_read(&dest->weight) == 0))) {
-		IP_VS_DBG_BUF(9, "check_template: dest not available for "
-			      "protocol %s s:%s:%d v:%s:%d "
-			      "-> d:%s:%d\n",
-			      ip_vs_proto_name(ct->protocol),
-			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
-			      ntohs(ct->cport),
-			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
-			      ntohs(ct->vport),
-			      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
-			      ntohs(ct->dport));
-
-		/*
-		 * Invalidate the connection template
-		 */
-		if (ct->vport != htons(0xffff)) {
-			if (ip_vs_conn_unhash(ct)) {
-				ct->dport = htons(0xffff);
-				ct->vport = htons(0xffff);
-				ct->cport = 0;
-				ip_vs_conn_hash(ct);
-			}
-		}
-
-		/*
-		 * Simply decrease the refcnt of the template,
-		 * don't restart its timer.
-		 */
-		atomic_dec(&ct->refcnt);
-		return 0;
-	}
-	return 1;
-}
-
-static void ip_vs_conn_expire(unsigned long data)
-{
-	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
-
-	cp->timeout = 60*HZ;
-
-	/*
-	 *	hey, I'm using it
-	 */
-	atomic_inc(&cp->refcnt);
-
-	/*
-	 *	do I control anybody?
-	 */
-	if (atomic_read(&cp->n_control))
-		goto expire_later;
-
-	/*
-	 *	unhash it if it is hashed in the conn table
-	 */
-	if (!ip_vs_conn_unhash(cp))
-		goto expire_later;
-
-	/*
-	 *	refcnt==1 implies I'm the only one referrer
-	 */
-	if (likely(atomic_read(&cp->refcnt) == 1)) {
-		/* delete the timer if it is activated by other users */
-		if (timer_pending(&cp->timer))
-			del_timer(&cp->timer);
-
-		/* does anybody control me? */
-		if (cp->control)
-			ip_vs_control_del(cp);
-
-		if (unlikely(cp->app != NULL))
-			ip_vs_unbind_app(cp);
-		ip_vs_unbind_dest(cp);
-		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
-			atomic_dec(&ip_vs_conn_no_cport_cnt);
-		atomic_dec(&ip_vs_conn_count);
-
-		kmem_cache_free(ip_vs_conn_cachep, cp);
-		return;
-	}
-
-	/* hash it back to the table */
-	ip_vs_conn_hash(cp);
-
-  expire_later:
-	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
-		  atomic_read(&cp->refcnt)-1,
-		  atomic_read(&cp->n_control));
-
-	ip_vs_conn_put(cp);
-}
-
-
-void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
-{
-	if (del_timer(&cp->timer))
-		mod_timer(&cp->timer, jiffies);
-}
-
-
-/*
- *	Create a new connection entry and hash it into the ip_vs_conn_tab
- */
-struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-	       const union nf_inet_addr *vaddr, __be16 vport,
-	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
-	       struct ip_vs_dest *dest)
-{
-	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
-
-	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
-	if (cp == NULL) {
-		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
-		return NULL;
-	}
-
-	INIT_LIST_HEAD(&cp->c_list);
-	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-	cp->af		   = af;
-	cp->protocol	   = proto;
-	ip_vs_addr_copy(af, &cp->caddr, caddr);
-	cp->cport	   = cport;
-	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-	cp->vport	   = vport;
-	ip_vs_addr_copy(af, &cp->daddr, daddr);
-	cp->dport          = dport;
-	cp->flags	   = flags;
-	spin_lock_init(&cp->lock);
-
-	/*
-	 * Set the entry is referenced by the current thread before hashing
-	 * it in the table, so that other thread run ip_vs_random_dropentry
-	 * but cannot drop this entry.
-	 */
-	atomic_set(&cp->refcnt, 1);
-
-	atomic_set(&cp->n_control, 0);
-	atomic_set(&cp->in_pkts, 0);
-
-	atomic_inc(&ip_vs_conn_count);
-	if (flags & IP_VS_CONN_F_NO_CPORT)
-		atomic_inc(&ip_vs_conn_no_cport_cnt);
-
-	/* Bind the connection with a destination server */
-	ip_vs_bind_dest(cp, dest);
-
-	/* Set its state and timeout */
-	cp->state = 0;
-	cp->timeout = 3*HZ;
-
-	/* Bind its packet transmitter */
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		ip_vs_bind_xmit_v6(cp);
-	else
-#endif
-		ip_vs_bind_xmit(cp);
-
-	if (unlikely(pp && atomic_read(&pp->appcnt)))
-		ip_vs_bind_app(cp, pp);
-
-	/* Hash it in the ip_vs_conn_tab finally */
-	ip_vs_conn_hash(cp);
-
-	return cp;
-}
-
-
-/*
- *	/proc/net/ip_vs_conn entries
- */
-#ifdef CONFIG_PROC_FS
-
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
-{
-	int idx;
-	struct ip_vs_conn *cp;
-
-	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
-		ct_read_lock_bh(idx);
-		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-			if (pos-- == 0) {
-				seq->private = &ip_vs_conn_tab[idx];
-				return cp;
-			}
-		}
-		ct_read_unlock_bh(idx);
-	}
-
-	return NULL;
-}
-
-static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
-{
-	seq->private = NULL;
-	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
-}
-
-static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct ip_vs_conn *cp = v;
-	struct list_head *e, *l = seq->private;
-	int idx;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ip_vs_conn_array(seq, 0);
-
-	/* more on same hash chain? */
-	if ((e = cp->c_list.next) != l)
-		return list_entry(e, struct ip_vs_conn, c_list);
-
-	idx = l - ip_vs_conn_tab;
-	ct_read_unlock_bh(idx);
-
-	while (++idx < IP_VS_CONN_TAB_SIZE) {
-		ct_read_lock_bh(idx);
-		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-			seq->private = &ip_vs_conn_tab[idx];
-			return cp;
-		}
-		ct_read_unlock_bh(idx);
-	}
-	seq->private = NULL;
-	return NULL;
-}
-
-static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
-{
-	struct list_head *l = seq->private;
-
-	if (l)
-		ct_read_unlock_bh(l - ip_vs_conn_tab);
-}
-
-static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
-{
-
-	if (v == SEQ_START_TOKEN)
-		seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
-	else {
-		const struct ip_vs_conn *cp = v;
-
-#ifdef CONFIG_IP_VS_IPV6
-		if (cp->af == AF_INET6)
-			seq_printf(seq,
-				"%-3s " NIP6_FMT " %04X " NIP6_FMT
-				" %04X " NIP6_FMT " %04X %-11s %7lu\n",
-				ip_vs_proto_name(cp->protocol),
-				NIP6(cp->caddr.in6), ntohs(cp->cport),
-				NIP6(cp->vaddr.in6), ntohs(cp->vport),
-				NIP6(cp->daddr.in6), ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
-		else
-#endif
-			seq_printf(seq,
-				"%-3s %08X %04X %08X %04X"
-				" %08X %04X %-11s %7lu\n",
-				ip_vs_proto_name(cp->protocol),
-				ntohl(cp->caddr.ip), ntohs(cp->cport),
-				ntohl(cp->vaddr.ip), ntohs(cp->vport),
-				ntohl(cp->daddr.ip), ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
-	}
-	return 0;
-}
-
-static const struct seq_operations ip_vs_conn_seq_ops = {
-	.start = ip_vs_conn_seq_start,
-	.next  = ip_vs_conn_seq_next,
-	.stop  = ip_vs_conn_seq_stop,
-	.show  = ip_vs_conn_seq_show,
-};
-
-static int ip_vs_conn_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ip_vs_conn_seq_ops);
-}
-
-static const struct file_operations ip_vs_conn_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = ip_vs_conn_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
-static const char *ip_vs_origin_name(unsigned flags)
-{
-	if (flags & IP_VS_CONN_F_SYNC)
-		return "SYNC";
-	else
-		return "LOCAL";
-}
-
-static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
-{
-
-	if (v == SEQ_START_TOKEN)
-		seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
-	else {
-		const struct ip_vs_conn *cp = v;
-
-#ifdef CONFIG_IP_VS_IPV6
-		if (cp->af == AF_INET6)
-			seq_printf(seq,
-				"%-3s " NIP6_FMT " %04X " NIP6_FMT
-				" %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
-				ip_vs_proto_name(cp->protocol),
-				NIP6(cp->caddr.in6), ntohs(cp->cport),
-				NIP6(cp->vaddr.in6), ntohs(cp->vport),
-				NIP6(cp->daddr.in6), ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
-				ip_vs_origin_name(cp->flags),
-				(cp->timer.expires-jiffies)/HZ);
-		else
-#endif
-			seq_printf(seq,
-				"%-3s %08X %04X %08X %04X "
-				"%08X %04X %-11s %-6s %7lu\n",
-				ip_vs_proto_name(cp->protocol),
-				ntohl(cp->caddr.ip), ntohs(cp->cport),
-				ntohl(cp->vaddr.ip), ntohs(cp->vport),
-				ntohl(cp->daddr.ip), ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
-				ip_vs_origin_name(cp->flags),
-				(cp->timer.expires-jiffies)/HZ);
-	}
-	return 0;
-}
-
-static const struct seq_operations ip_vs_conn_sync_seq_ops = {
-	.start = ip_vs_conn_seq_start,
-	.next  = ip_vs_conn_seq_next,
-	.stop  = ip_vs_conn_seq_stop,
-	.show  = ip_vs_conn_sync_seq_show,
-};
-
-static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ip_vs_conn_sync_seq_ops);
-}
-
-static const struct file_operations ip_vs_conn_sync_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = ip_vs_conn_sync_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
-#endif
-
-
-/*
- *      Randomly drop connection entries before running out of memory
- */
-static inline int todrop_entry(struct ip_vs_conn *cp)
-{
-	/*
-	 * The drop rate array needs tuning for real environments.
-	 * Called from timer bh only => no locking
-	 */
-	static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
-	static char todrop_counter[9] = {0};
-	int i;
-
-	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
-	   This will leave enough time for normal connection to get
-	   through. */
-	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
-		return 0;
-
-	/* Don't drop the entry if its number of incoming packets is not
-	   located in [0, 8] */
-	i = atomic_read(&cp->in_pkts);
-	if (i > 8 || i < 0) return 0;
-
-	if (!todrop_rate[i]) return 0;
-	if (--todrop_counter[i] > 0) return 0;
-
-	todrop_counter[i] = todrop_rate[i];
-	return 1;
-}
-
-/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
-{
-	int idx;
-	struct ip_vs_conn *cp;
-
-	/*
-	 * Randomly scan 1/32 of the whole table every second
-	 */
-	for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
-		unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
-
-		/*
-		 *  Lock is actually needed in this loop.
-		 */
-		ct_write_lock_bh(hash);
-
-		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
-				/* connection template */
-				continue;
-
-			if (cp->protocol == IPPROTO_TCP) {
-				switch(cp->state) {
-				case IP_VS_TCP_S_SYN_RECV:
-				case IP_VS_TCP_S_SYNACK:
-					break;
-
-				case IP_VS_TCP_S_ESTABLISHED:
-					if (todrop_entry(cp))
-						break;
-					continue;
-
-				default:
-					continue;
-				}
-			} else {
-				if (!todrop_entry(cp))
-					continue;
-			}
-
-			IP_VS_DBG(4, "del connection\n");
-			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
-				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
-			}
-		}
-		ct_write_unlock_bh(hash);
-	}
-}
-
-
-/*
- *      Flush all the connection entries in the ip_vs_conn_tab
- */
-static void ip_vs_conn_flush(void)
-{
-	int idx;
-	struct ip_vs_conn *cp;
-
-  flush_again:
-	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
-		/*
-		 *  Lock is actually needed in this loop.
-		 */
-		ct_write_lock_bh(idx);
-
-		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
-			IP_VS_DBG(4, "del connection\n");
-			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
-				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
-			}
-		}
-		ct_write_unlock_bh(idx);
-	}
-
-	/* the counter may be not NULL, because maybe some conn entries
-	   are run by slow timer handler or unhashed but still referred */
-	if (atomic_read(&ip_vs_conn_count) != 0) {
-		schedule();
-		goto flush_again;
-	}
-}
-
-
-int __init ip_vs_conn_init(void)
-{
-	int idx;
-
-	/*
-	 * Allocate the connection hash table and initialize its list heads
-	 */
-	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
-	if (!ip_vs_conn_tab)
-		return -ENOMEM;
-
-	/* Allocate ip_vs_conn slab cache */
-	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
-					      sizeof(struct ip_vs_conn), 0,
-					      SLAB_HWCACHE_ALIGN, NULL);
-	if (!ip_vs_conn_cachep) {
-		vfree(ip_vs_conn_tab);
-		return -ENOMEM;
-	}
-
-	IP_VS_INFO("Connection hash table configured "
-		   "(size=%d, memory=%ldKbytes)\n",
-		   IP_VS_CONN_TAB_SIZE,
-		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
-	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
-		  sizeof(struct ip_vs_conn));
-
-	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
-		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
-	}
-
-	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
-		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
-	}
-
-	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
-	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
-
-	/* calculate the random value for connection hash */
-	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
-
-	return 0;
-}
-
-
-void ip_vs_conn_cleanup(void)
-{
-	/* flush all the connection entries first */
-	ip_vs_conn_flush();
-
-	/* Release the empty cache */
-	kmem_cache_destroy(ip_vs_conn_cachep);
-	proc_net_remove(&init_net, "ip_vs_conn");
-	proc_net_remove(&init_net, "ip_vs_conn_sync");
-	vfree(ip_vs_conn_tab);
-}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
deleted file mode 100644
index 958abf3e5f8..00000000000
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ /dev/null
@@ -1,1542 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the Netfilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
- * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
- * and others.
- *
- * Changes:
- *	Paul `Rusty' Russell		properly handle non-linear skbs
- *	Harald Welte			don't use nfcache
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/icmp.h>
-
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <net/icmp.h>                   /* for icmp_send */
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#ifdef CONFIG_IP_VS_IPV6
-#include <net/ipv6.h>
-#include <linux/netfilter_ipv6.h>
-#endif
-
-#include <net/ip_vs.h>
-
-
-EXPORT_SYMBOL(register_ip_vs_scheduler);
-EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
-EXPORT_SYMBOL(ip_vs_proto_name);
-EXPORT_SYMBOL(ip_vs_conn_new);
-EXPORT_SYMBOL(ip_vs_conn_in_get);
-EXPORT_SYMBOL(ip_vs_conn_out_get);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
-#endif
-EXPORT_SYMBOL(ip_vs_conn_put);
-#ifdef CONFIG_IP_VS_DEBUG
-EXPORT_SYMBOL(ip_vs_get_debug_level);
-#endif
-
-
-/* ID used in ICMP lookups */
-#define icmp_id(icmph)          (((icmph)->un).echo.id)
-#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
-
-const char *ip_vs_proto_name(unsigned proto)
-{
-	static char buf[20];
-
-	switch (proto) {
-	case IPPROTO_IP:
-		return "IP";
-	case IPPROTO_UDP:
-		return "UDP";
-	case IPPROTO_TCP:
-		return "TCP";
-	case IPPROTO_ICMP:
-		return "ICMP";
-#ifdef CONFIG_IP_VS_IPV6
-	case IPPROTO_ICMPV6:
-		return "ICMPv6";
-#endif
-	default:
-		sprintf(buf, "IP_%d", proto);
-		return buf;
-	}
-}
-
-void ip_vs_init_hash_table(struct list_head *table, int rows)
-{
-	while (--rows >= 0)
-		INIT_LIST_HEAD(&table[rows]);
-}
-
-static inline void
-ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest = cp->dest;
-	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-		spin_lock(&dest->stats.lock);
-		dest->stats.ustats.inpkts++;
-		dest->stats.ustats.inbytes += skb->len;
-		spin_unlock(&dest->stats.lock);
-
-		spin_lock(&dest->svc->stats.lock);
-		dest->svc->stats.ustats.inpkts++;
-		dest->svc->stats.ustats.inbytes += skb->len;
-		spin_unlock(&dest->svc->stats.lock);
-
-		spin_lock(&ip_vs_stats.lock);
-		ip_vs_stats.ustats.inpkts++;
-		ip_vs_stats.ustats.inbytes += skb->len;
-		spin_unlock(&ip_vs_stats.lock);
-	}
-}
-
-
-static inline void
-ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest = cp->dest;
-	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-		spin_lock(&dest->stats.lock);
-		dest->stats.ustats.outpkts++;
-		dest->stats.ustats.outbytes += skb->len;
-		spin_unlock(&dest->stats.lock);
-
-		spin_lock(&dest->svc->stats.lock);
-		dest->svc->stats.ustats.outpkts++;
-		dest->svc->stats.ustats.outbytes += skb->len;
-		spin_unlock(&dest->svc->stats.lock);
-
-		spin_lock(&ip_vs_stats.lock);
-		ip_vs_stats.ustats.outpkts++;
-		ip_vs_stats.ustats.outbytes += skb->len;
-		spin_unlock(&ip_vs_stats.lock);
-	}
-}
-
-
-static inline void
-ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
-{
-	spin_lock(&cp->dest->stats.lock);
-	cp->dest->stats.ustats.conns++;
-	spin_unlock(&cp->dest->stats.lock);
-
-	spin_lock(&svc->stats.lock);
-	svc->stats.ustats.conns++;
-	spin_unlock(&svc->stats.lock);
-
-	spin_lock(&ip_vs_stats.lock);
-	ip_vs_stats.ustats.conns++;
-	spin_unlock(&ip_vs_stats.lock);
-}
-
-
-static inline int
-ip_vs_set_state(struct ip_vs_conn *cp, int direction,
-		const struct sk_buff *skb,
-		struct ip_vs_protocol *pp)
-{
-	if (unlikely(!pp->state_transition))
-		return 0;
-	return pp->state_transition(cp, direction, skb, pp);
-}
-
-
-/*
- *  IPVS persistent scheduling function
- *  It creates a connection entry according to its template if exists,
- *  or selects a server and creates a connection entry plus a template.
- *  Locking: we are svc user (svc->refcnt), so we hold all dests too
- *  Protocols supported: TCP, UDP
- */
-static struct ip_vs_conn *
-ip_vs_sched_persist(struct ip_vs_service *svc,
-		    const struct sk_buff *skb,
-		    __be16 ports[2])
-{
-	struct ip_vs_conn *cp = NULL;
-	struct ip_vs_iphdr iph;
-	struct ip_vs_dest *dest;
-	struct ip_vs_conn *ct;
-	__be16  dport;			/* destination port to forward */
-	union nf_inet_addr snet;	/* source network of the client,
-					   after masking */
-
-	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
-	/* Mask saddr with the netmask to adjust template granularity */
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6)
-		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
-	else
-#endif
-		snet.ip = iph.saddr.ip & svc->netmask;
-
-	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
-		      "mnet %s\n",
-		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
-		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
-		      IP_VS_DBG_ADDR(svc->af, &snet));
-
-	/*
-	 * As far as we know, FTP is a very complicated network protocol, and
-	 * it uses control connection and data connections. For active FTP,
-	 * FTP server initialize data connection to the client, its source port
-	 * is often 20. For passive FTP, FTP server tells the clients the port
-	 * that it passively listens to,  and the client issues the data
-	 * connection. In the tunneling or direct routing mode, the load
-	 * balancer is on the client-to-server half of connection, the port
-	 * number is unknown to the load balancer. So, a conn template like
-	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
-	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
-	 * is created for other persistent services.
-	 */
-	if (ports[1] == svc->port) {
-		/* Check if a template already exists */
-		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, ports[1]);
-		else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * No template found or the dest of the connection
-			 * template is not available.
-			 */
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
-
-			/*
-			 * Create a template like <protocol,caddr,0,
-			 * vaddr,vport,daddr,dport> for non-ftp service,
-			 * and <protocol,caddr,0,vaddr,0,daddr,0>
-			 * for ftp service.
-			 */
-			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr,
-						    ports[1],
-						    &dest->addr, dest->port,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
-		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
-		}
-		dport = dest->port;
-	} else {
-		/*
-		 * Note: persistent fwmark-based services and persistent
-		 * port zero service are handled here.
-		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-		 */
-		if (svc->fwmark) {
-			union nf_inet_addr fwmark = {
-				.all = { 0, 0, 0, htonl(svc->fwmark) }
-			};
-
-			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-					     &fwmark, 0);
-		} else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * If it is not persistent port zero, return NULL,
-			 * otherwise create a connection template.
-			 */
-			if (svc->port)
-				return NULL;
-
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
-
-			/*
-			 * Create a template according to the service
-			 */
-			if (svc->fwmark) {
-				union nf_inet_addr fwmark = {
-					.all = { 0, 0, 0, htonl(svc->fwmark) }
-				};
-
-				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-						    &snet, 0,
-						    &fwmark, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			} else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
-		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
-		}
-		dport = ports[1];
-	}
-
-	/*
-	 *    Create a new connection according to the template
-	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, ports[0],
-			    &iph.daddr, ports[1],
-			    &dest->addr, dport,
-			    0,
-			    dest);
-	if (cp == NULL) {
-		ip_vs_conn_put(ct);
-		return NULL;
-	}
-
-	/*
-	 *    Add its control
-	 */
-	ip_vs_control_add(cp, ct);
-	ip_vs_conn_put(ct);
-
-	ip_vs_conn_stats(cp, svc);
-	return cp;
-}
-
-
-/*
- *  IPVS main scheduling function
- *  It selects a server according to the virtual service, and
- *  creates a connection entry.
- *  Protocols supported: TCP, UDP
- */
-struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_conn *cp = NULL;
-	struct ip_vs_iphdr iph;
-	struct ip_vs_dest *dest;
-	__be16 _ports[2], *pptr;
-
-	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	/*
-	 *    Persistent service
-	 */
-	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
-		return ip_vs_sched_persist(svc, skb, pptr);
-
-	/*
-	 *    Non-persistent service
-	 */
-	if (!svc->fwmark && pptr[1] != svc->port) {
-		if (!svc->port)
-			IP_VS_ERR("Schedule: port zero only supported "
-				  "in persistent services, "
-				  "check your ipvs configuration\n");
-		return NULL;
-	}
-
-	dest = svc->scheduler->schedule(svc, skb);
-	if (dest == NULL) {
-		IP_VS_DBG(1, "Schedule: no dest found.\n");
-		return NULL;
-	}
-
-	/*
-	 *    Create a connection entry.
-	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, pptr[0],
-			    &iph.daddr, pptr[1],
-			    &dest->addr, dest->port ? dest->port : pptr[1],
-			    0,
-			    dest);
-	if (cp == NULL)
-		return NULL;
-
-	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
-		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
-		      ip_vs_fwd_tag(cp),
-		      IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
-		      IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
-		      IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
-		      cp->flags, atomic_read(&cp->refcnt));
-
-	ip_vs_conn_stats(cp, svc);
-	return cp;
-}
-
-
-/*
- *  Pass or drop the packet.
- *  Called by ip_vs_in, when the virtual service is available but
- *  no destination is available for a new connection.
- */
-int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
-		struct ip_vs_protocol *pp)
-{
-	__be16 _ports[2], *pptr;
-	struct ip_vs_iphdr iph;
-	int unicast;
-	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
-	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
-	if (pptr == NULL) {
-		ip_vs_service_put(svc);
-		return NF_DROP;
-	}
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6)
-		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
-	else
-#endif
-		unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
-
-	/* if it is fwmark-based service, the cache_bypass sysctl is up
-	   and the destination is a non-local unicast, then create
-	   a cache_bypass connection entry */
-	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
-		int ret, cs;
-		struct ip_vs_conn *cp;
-		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
-
-		ip_vs_service_put(svc);
-
-		/* create a new connection entry */
-		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
-		cp = ip_vs_conn_new(svc->af, iph.protocol,
-				    &iph.saddr, pptr[0],
-				    &iph.daddr, pptr[1],
-				    &daddr, 0,
-				    IP_VS_CONN_F_BYPASS,
-				    NULL);
-		if (cp == NULL)
-			return NF_DROP;
-
-		/* statistics */
-		ip_vs_in_stats(cp, skb);
-
-		/* set state */
-		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
-
-		/* transmit the first SYN packet */
-		ret = cp->packet_xmit(skb, cp, pp);
-		/* do not touch skb anymore */
-
-		atomic_inc(&cp->in_pkts);
-		ip_vs_conn_put(cp);
-		return ret;
-	}
-
-	/*
-	 * When the virtual ftp service is presented, packets destined
-	 * for other services on the VIP may get here (except services
-	 * listed in the ipvs table), pass the packets, because it is
-	 * not ipvs job to decide to drop the packets.
-	 */
-	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
-		ip_vs_service_put(svc);
-		return NF_ACCEPT;
-	}
-
-	ip_vs_service_put(svc);
-
-	/*
-	 * Notify the client that the destination is unreachable, and
-	 * release the socket buffer.
-	 * Since it is in IP layer, the TCP socket is not actually
-	 * created, the TCP RST packet cannot be sent, instead that
-	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
-	 */
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6)
-		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
-			    skb->dev);
-	else
-#endif
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
-	return NF_DROP;
-}
-
-
-/*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- *      chain, and is used for VS/NAT.
- *      It detects packets for VS/NAT connections and sends the packets
- *      immediately. This can avoid that iptable_nat mangles the packets
- *      for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	if (!skb->ipvs_property)
-		return NF_ACCEPT;
-	/* The packet was sent from IPVS, exit this chain */
-	return NF_STOP;
-}
-
-__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
-{
-	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
-}
-
-static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
-{
-	int err = ip_defrag(skb, user);
-
-	if (!err)
-		ip_send_check(ip_hdr(skb));
-
-	return err;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
-{
-	/* TODO IPv6: Find out what to do here for IPv6 */
-	return 0;
-}
-#endif
-
-/*
- * Packet has been made sufficiently writable in caller
- * - inout: 1=in->out, 0=out->in
- */
-void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
-		    struct ip_vs_conn *cp, int inout)
-{
-	struct iphdr *iph	 = ip_hdr(skb);
-	unsigned int icmp_offset = iph->ihl*4;
-	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
-						      icmp_offset);
-	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
-
-	if (inout) {
-		iph->saddr = cp->vaddr.ip;
-		ip_send_check(iph);
-		ciph->daddr = cp->vaddr.ip;
-		ip_send_check(ciph);
-	} else {
-		iph->daddr = cp->daddr.ip;
-		ip_send_check(iph);
-		ciph->saddr = cp->daddr.ip;
-		ip_send_check(ciph);
-	}
-
-	/* the TCP/UDP port */
-	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
-		__be16 *ports = (void *)ciph + ciph->ihl*4;
-
-		if (inout)
-			ports[1] = cp->vport;
-		else
-			ports[0] = cp->dport;
-	}
-
-	/* And finally the ICMP checksum */
-	icmph->checksum = 0;
-	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered outgoing ICMP");
-	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered incoming ICMP");
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
-		    struct ip_vs_conn *cp, int inout)
-{
-	struct ipv6hdr *iph	 = ipv6_hdr(skb);
-	unsigned int icmp_offset = sizeof(struct ipv6hdr);
-	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) +
-						      icmp_offset);
-	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1);
-
-	if (inout) {
-		iph->saddr = cp->vaddr.in6;
-		ciph->daddr = cp->vaddr.in6;
-	} else {
-		iph->daddr = cp->daddr.in6;
-		ciph->saddr = cp->daddr.in6;
-	}
-
-	/* the TCP/UDP port */
-	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
-		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
-
-		if (inout)
-			ports[1] = cp->vport;
-		else
-			ports[0] = cp->dport;
-	}
-
-	/* And finally the ICMP checksum */
-	icmph->icmp6_cksum = 0;
-	/* TODO IPv6: is this correct for ICMPv6? */
-	ip_vs_checksum_complete(skb, icmp_offset);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered outgoing ICMPv6");
-	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered incoming ICMPv6");
-}
-#endif
-
-/* Handle relevant response ICMP messages - forward to the right
- * destination host. Used for NAT and local client.
- */
-static int handle_response_icmp(int af, struct sk_buff *skb,
-				union nf_inet_addr *snet,
-				__u8 protocol, struct ip_vs_conn *cp,
-				struct ip_vs_protocol *pp,
-				unsigned int offset, unsigned int ihl)
-{
-	unsigned int verdict = NF_DROP;
-
-	if (IP_VS_FWD_METHOD(cp) != 0) {
-		IP_VS_ERR("shouldn't reach here, because the box is on the "
-			  "half connection in the tun/dr module.\n");
-	}
-
-	/* Ensure the checksum is correct */
-	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
-		/* Failed checksum! */
-		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
-			      IP_VS_DBG_ADDR(af, snet));
-		goto out;
-	}
-
-	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
-		offset += 2 * sizeof(__u16);
-	if (!skb_make_writable(skb, offset))
-		goto out;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
-	else
-#endif
-		ip_vs_nat_icmp(skb, pp, cp, 1);
-
-	/* do the statistics and put it back */
-	ip_vs_out_stats(cp, skb);
-
-	skb->ipvs_property = 1;
-	verdict = NF_ACCEPT;
-
-out:
-	__ip_vs_conn_put(cp);
-
-	return verdict;
-}
-
-/*
- *	Handle ICMP messages in the inside-to-outside direction (outgoing).
- *	Find any that might be relevant, check against existing connections.
- *	Currently handles error types - unreachable, quench, ttl exceeded.
- */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
-{
-	struct iphdr *iph;
-	struct icmphdr	_icmph, *ic;
-	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
-	struct ip_vs_iphdr ciph;
-	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp;
-	unsigned int offset, ihl;
-	union nf_inet_addr snet;
-
-	*related = 1;
-
-	/* reassemble IP fragments */
-	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
-			return NF_STOLEN;
-	}
-
-	iph = ip_hdr(skb);
-	offset = ihl = iph->ihl * 4;
-	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-	if (ic == NULL)
-		return NF_DROP;
-
-	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
-		  ic->type, ntohs(icmp_id(ic)),
-		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-
-	/*
-	 * Work through seeing if this is for us.
-	 * These checks are supposed to be in an order that means easy
-	 * things are checked first to speed up processing.... however
-	 * this means that some packets will manage to get a long way
-	 * down this stack and then be rejected, but that's life.
-	 */
-	if ((ic->type != ICMP_DEST_UNREACH) &&
-	    (ic->type != ICMP_SOURCE_QUENCH) &&
-	    (ic->type != ICMP_TIME_EXCEEDED)) {
-		*related = 0;
-		return NF_ACCEPT;
-	}
-
-	/* Now find the contained IP header */
-	offset += sizeof(_icmph);
-	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-	if (cih == NULL)
-		return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-	pp = ip_vs_proto_get(cih->protocol);
-	if (!pp)
-		return NF_ACCEPT;
-
-	/* Is the embedded protocol header present? */
-	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
-		     pp->dont_defrag))
-		return NF_ACCEPT;
-
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
-
-	offset += cih->ihl * 4;
-
-	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
-	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
-	if (!cp)
-		return NF_ACCEPT;
-
-	snet.ip = iph->saddr;
-	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
-				    pp, offset, ihl);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
-{
-	struct ipv6hdr *iph;
-	struct icmp6hdr	_icmph, *ic;
-	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
-					   within the ICMP */
-	struct ip_vs_iphdr ciph;
-	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp;
-	unsigned int offset;
-	union nf_inet_addr snet;
-
-	*related = 1;
-
-	/* reassemble IP fragments */
-	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
-			return NF_STOLEN;
-	}
-
-	iph = ipv6_hdr(skb);
-	offset = sizeof(struct ipv6hdr);
-	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-	if (ic == NULL)
-		return NF_DROP;
-
-	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
-		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
-		  NIP6(iph->saddr), NIP6(iph->daddr));
-
-	/*
-	 * Work through seeing if this is for us.
-	 * These checks are supposed to be in an order that means easy
-	 * things are checked first to speed up processing.... however
-	 * this means that some packets will manage to get a long way
-	 * down this stack and then be rejected, but that's life.
-	 */
-	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
-	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
-	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
-		*related = 0;
-		return NF_ACCEPT;
-	}
-
-	/* Now find the contained IP header */
-	offset += sizeof(_icmph);
-	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-	if (cih == NULL)
-		return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-	pp = ip_vs_proto_get(cih->nexthdr);
-	if (!pp)
-		return NF_ACCEPT;
-
-	/* Is the embedded protocol header present? */
-	/* TODO: we don't support fragmentation at the moment anyways */
-	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
-		return NF_ACCEPT;
-
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
-
-	offset += sizeof(struct ipv6hdr);
-
-	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
-	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
-	if (!cp)
-		return NF_ACCEPT;
-
-	ipv6_addr_copy(&snet.in6, &iph->saddr);
-	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
-				    pp, offset, sizeof(struct ipv6hdr));
-}
-#endif
-
-static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
-{
-	struct tcphdr _tcph, *th;
-
-	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
-	if (th == NULL)
-		return 0;
-	return th->rst;
-}
-
-/* Handle response packets: rewrite addresses and send away...
- * Used for NAT and local client.
- */
-static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-		struct ip_vs_conn *cp, int ihl)
-{
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
-
-	if (!skb_make_writable(skb, ihl))
-		goto drop;
-
-	/* mangle the packet */
-	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
-		goto drop;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
-	else
-#endif
-	{
-		ip_hdr(skb)->saddr = cp->vaddr.ip;
-		ip_send_check(ip_hdr(skb));
-	}
-
-	/* For policy routing, packets originating from this
-	 * machine itself may be routed differently to packets
-	 * passing through.  We want this packet to be routed as
-	 * if it came from this machine itself.  So re-compute
-	 * the routing information.
-	 */
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6) {
-		if (ip6_route_me_harder(skb) != 0)
-			goto drop;
-	} else
-#endif
-		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-			goto drop;
-
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
-
-	ip_vs_out_stats(cp, skb);
-	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-	ip_vs_conn_put(cp);
-
-	skb->ipvs_property = 1;
-
-	LeaveFunction(11);
-	return NF_ACCEPT;
-
-drop:
-	ip_vs_conn_put(cp);
-	kfree_skb(skb);
-	return NF_STOLEN;
-}
-
-/*
- *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
- *	Check if outgoing packet belongs to the established ip_vs_conn.
- */
-static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-	  const struct net_device *in, const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
-{
-	struct ip_vs_iphdr iph;
-	struct ip_vs_protocol *pp;
-	struct ip_vs_conn *cp;
-	int af;
-
-	EnterFunction(11);
-
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-	if (skb->ipvs_property)
-		return NF_ACCEPT;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6) {
-		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-			if (related)
-				return verdict;
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-		}
-	} else
-#endif
-		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_out_icmp(skb, &related);
-
-			if (related)
-				return verdict;
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-		}
-
-	pp = ip_vs_proto_get(iph.protocol);
-	if (unlikely(!pp))
-		return NF_ACCEPT;
-
-	/* reassemble IP fragments */
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6) {
-		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-			if (related)
-				return verdict;
-
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-		}
-	} else
-#endif
-		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
-			     !pp->dont_defrag)) {
-			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
-				return NF_STOLEN;
-
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-		}
-
-	/*
-	 * Check if the packet belongs to an existing entry
-	 */
-	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-
-	if (unlikely(!cp)) {
-		if (sysctl_ip_vs_nat_icmp_send &&
-		    (pp->protocol == IPPROTO_TCP ||
-		     pp->protocol == IPPROTO_UDP)) {
-			__be16 _ports[2], *pptr;
-
-			pptr = skb_header_pointer(skb, iph.len,
-						  sizeof(_ports), _ports);
-			if (pptr == NULL)
-				return NF_ACCEPT;	/* Not for me */
-			if (ip_vs_lookup_real_service(af, iph.protocol,
-						      &iph.saddr,
-						      pptr[0])) {
-				/*
-				 * Notify the real server: there is no
-				 * existing entry if it is not RST
-				 * packet or not TCP packet.
-				 */
-				if (iph.protocol != IPPROTO_TCP
-				    || !is_tcp_reset(skb, iph.len)) {
-#ifdef CONFIG_IP_VS_IPV6
-					if (af == AF_INET6)
-						icmpv6_send(skb,
-							    ICMPV6_DEST_UNREACH,
-							    ICMPV6_PORT_UNREACH,
-							    0, skb->dev);
-					else
-#endif
-						icmp_send(skb,
-							  ICMP_DEST_UNREACH,
-							  ICMP_PORT_UNREACH, 0);
-					return NF_DROP;
-				}
-			}
-		}
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
-		return NF_ACCEPT;
-	}
-
-	return handle_response(af, skb, pp, cp, iph.len);
-}
-
-
-/*
- *	Handle ICMP messages in the outside-to-inside direction (incoming).
- *	Find any that might be relevant, check against existing connections,
- *	forward to the right destination host if relevant.
- *	Currently handles error types - unreachable, quench, ttl exceeded.
- */
-static int
-ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
-{
-	struct iphdr *iph;
-	struct icmphdr	_icmph, *ic;
-	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
-	struct ip_vs_iphdr ciph;
-	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp;
-	unsigned int offset, ihl, verdict;
-	union nf_inet_addr snet;
-
-	*related = 1;
-
-	/* reassemble IP fragments */
-	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
-			return NF_STOLEN;
-	}
-
-	iph = ip_hdr(skb);
-	offset = ihl = iph->ihl * 4;
-	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-	if (ic == NULL)
-		return NF_DROP;
-
-	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
-		  ic->type, ntohs(icmp_id(ic)),
-		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-
-	/*
-	 * Work through seeing if this is for us.
-	 * These checks are supposed to be in an order that means easy
-	 * things are checked first to speed up processing.... however
-	 * this means that some packets will manage to get a long way
-	 * down this stack and then be rejected, but that's life.
-	 */
-	if ((ic->type != ICMP_DEST_UNREACH) &&
-	    (ic->type != ICMP_SOURCE_QUENCH) &&
-	    (ic->type != ICMP_TIME_EXCEEDED)) {
-		*related = 0;
-		return NF_ACCEPT;
-	}
-
-	/* Now find the contained IP header */
-	offset += sizeof(_icmph);
-	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-	if (cih == NULL)
-		return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-	pp = ip_vs_proto_get(cih->protocol);
-	if (!pp)
-		return NF_ACCEPT;
-
-	/* Is the embedded protocol header present? */
-	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
-		     pp->dont_defrag))
-		return NF_ACCEPT;
-
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
-
-	offset += cih->ihl * 4;
-
-	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
-	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
-	if (!cp) {
-		/* The packet could also belong to a local client */
-		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
-		if (cp) {
-			snet.ip = iph->saddr;
-			return handle_response_icmp(AF_INET, skb, &snet,
-						    cih->protocol, cp, pp,
-						    offset, ihl);
-		}
-		return NF_ACCEPT;
-	}
-
-	verdict = NF_DROP;
-
-	/* Ensure the checksum is correct */
-	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
-		/* Failed checksum! */
-		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
-			  NIPQUAD(iph->saddr));
-		goto out;
-	}
-
-	/* do the statistics and put it back */
-	ip_vs_in_stats(cp, skb);
-	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
-		offset += 2 * sizeof(__u16);
-	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-	/* do not touch skb anymore */
-
-  out:
-	__ip_vs_conn_put(cp);
-
-	return verdict;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static int
-ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
-{
-	struct ipv6hdr *iph;
-	struct icmp6hdr	_icmph, *ic;
-	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
-					   within the ICMP */
-	struct ip_vs_iphdr ciph;
-	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp;
-	unsigned int offset, verdict;
-	union nf_inet_addr snet;
-
-	*related = 1;
-
-	/* reassemble IP fragments */
-	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-					       IP_DEFRAG_VS_IN :
-					       IP_DEFRAG_VS_FWD))
-			return NF_STOLEN;
-	}
-
-	iph = ipv6_hdr(skb);
-	offset = sizeof(struct ipv6hdr);
-	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-	if (ic == NULL)
-		return NF_DROP;
-
-	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
-		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
-		  NIP6(iph->saddr), NIP6(iph->daddr));
-
-	/*
-	 * Work through seeing if this is for us.
-	 * These checks are supposed to be in an order that means easy
-	 * things are checked first to speed up processing.... however
-	 * this means that some packets will manage to get a long way
-	 * down this stack and then be rejected, but that's life.
-	 */
-	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
-	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
-	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
-		*related = 0;
-		return NF_ACCEPT;
-	}
-
-	/* Now find the contained IP header */
-	offset += sizeof(_icmph);
-	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-	if (cih == NULL)
-		return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-	pp = ip_vs_proto_get(cih->nexthdr);
-	if (!pp)
-		return NF_ACCEPT;
-
-	/* Is the embedded protocol header present? */
-	/* TODO: we don't support fragmentation at the moment anyways */
-	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
-		return NF_ACCEPT;
-
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
-
-	offset += sizeof(struct ipv6hdr);
-
-	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
-	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
-	if (!cp) {
-		/* The packet could also belong to a local client */
-		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
-		if (cp) {
-			ipv6_addr_copy(&snet.in6, &iph->saddr);
-			return handle_response_icmp(AF_INET6, skb, &snet,
-						    cih->nexthdr,
-						    cp, pp, offset,
-						    sizeof(struct ipv6hdr));
-		}
-		return NF_ACCEPT;
-	}
-
-	verdict = NF_DROP;
-
-	/* do the statistics and put it back */
-	ip_vs_in_stats(cp, skb);
-	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
-		offset += 2 * sizeof(__u16);
-	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-	/* do not touch skb anymore */
-
-	__ip_vs_conn_put(cp);
-
-	return verdict;
-}
-#endif
-
-
-/*
- *	Check if it's for virtual services, look it up,
- *	and send it on its way...
- */
-static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-	 const struct net_device *in, const struct net_device *out,
-	 int (*okfn)(struct sk_buff *))
-{
-	struct ip_vs_iphdr iph;
-	struct ip_vs_protocol *pp;
-	struct ip_vs_conn *cp;
-	int ret, restart, af;
-
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
-	/*
-	 *	Big tappo: only PACKET_HOST, including loopback for local client
-	 *	Don't handle local packets on IPv6 for now
-	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST)) {
-		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-			      skb->pkt_type,
-			      iph.protocol,
-			      IP_VS_DBG_ADDR(af, &iph.daddr));
-		return NF_ACCEPT;
-	}
-
-	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-		int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
-
-		if (related)
-			return verdict;
-		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-	}
-
-	/* Protocol supported? */
-	pp = ip_vs_proto_get(iph.protocol);
-	if (unlikely(!pp))
-		return NF_ACCEPT;
-
-	/*
-	 * Check if the packet belongs to an existing connection entry
-	 */
-	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
-
-	if (unlikely(!cp)) {
-		int v;
-
-		/* For local client packets, it could be a response */
-		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-		if (cp)
-			return handle_response(af, skb, pp, cp, iph.len);
-
-		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
-			return v;
-	}
-
-	if (unlikely(!cp)) {
-		/* sorry, all this trouble for a no-hit :) */
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
-		return NF_ACCEPT;
-	}
-
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
-
-	/* Check the server status */
-	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-		/* the destination server is not available */
-
-		if (sysctl_ip_vs_expire_nodest_conn) {
-			/* try to expire the connection immediately */
-			ip_vs_conn_expire_now(cp);
-		}
-		/* don't restart its timer, and silently
-		   drop the packet. */
-		__ip_vs_conn_put(cp);
-		return NF_DROP;
-	}
-
-	ip_vs_in_stats(cp, skb);
-	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
-	if (cp->packet_xmit)
-		ret = cp->packet_xmit(skb, cp, pp);
-		/* do not touch skb anymore */
-	else {
-		IP_VS_DBG_RL("warning: packet_xmit is null");
-		ret = NF_ACCEPT;
-	}
-
-	/* Increase its packet counter and check if it is needed
-	 * to be synchronized
-	 *
-	 * Sync connection if it is about to close to
-	 * encorage the standby servers to update the connections timeout
-	 */
-	atomic_inc(&cp->in_pkts);
-	if (af == AF_INET &&
-	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
-	    (((cp->protocol != IPPROTO_TCP ||
-	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
-	      (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
-	       == sysctl_ip_vs_sync_threshold[0])) ||
-	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
-	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
-	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
-	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
-		ip_vs_sync_conn(cp);
-	cp->old_state = cp->state;
-
-	ip_vs_conn_put(cp);
-	return ret;
-}
-
-
-/*
- *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
- *      related packets destined for 0.0.0.0/0.
- *      When fwmark-based virtual service is used, such as transparent
- *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
- *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
- *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
- *      and send them to ip_vs_in_icmp.
- */
-static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
-		   const struct net_device *in, const struct net_device *out,
-		   int (*okfn)(struct sk_buff *))
-{
-	int r;
-
-	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
-		return NF_ACCEPT;
-
-	return ip_vs_in_icmp(skb, &r, hooknum);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static unsigned int
-ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
-		      const struct net_device *in, const struct net_device *out,
-		      int (*okfn)(struct sk_buff *))
-{
-	int r;
-
-	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
-		return NF_ACCEPT;
-
-	return ip_vs_in_icmp_v6(skb, &r, hooknum);
-}
-#endif
-
-
-static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
-	/* After packet filtering, forward packet through VS/DR, VS/TUN,
-	 * or VS/NAT(change destination), so that filtering rules can be
-	 * applied to IPVS. */
-	{
-		.hook		= ip_vs_in,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
-	},
-	/* After packet filtering, change source only for VS/NAT */
-	{
-		.hook		= ip_vs_out,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
-	},
-	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
-	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
-	{
-		.hook		= ip_vs_forward_icmp,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
-	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP_PRI_NAT_SRC-1,
-	},
-#ifdef CONFIG_IP_VS_IPV6
-	/* After packet filtering, forward packet through VS/DR, VS/TUN,
-	 * or VS/NAT(change destination), so that filtering rules can be
-	 * applied to IPVS. */
-	{
-		.hook		= ip_vs_in,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
-	},
-	/* After packet filtering, change source only for VS/NAT */
-	{
-		.hook		= ip_vs_out,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
-	},
-	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
-	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
-	{
-		.hook		= ip_vs_forward_icmp_v6,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
-	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP6_PRI_NAT_SRC-1,
-	},
-#endif
-};
-
-
-/*
- *	Initialize IP Virtual Server
- */
-static int __init ip_vs_init(void)
-{
-	int ret;
-
-	ip_vs_estimator_init();
-
-	ret = ip_vs_control_init();
-	if (ret < 0) {
-		IP_VS_ERR("can't setup control.\n");
-		goto cleanup_estimator;
-	}
-
-	ip_vs_protocol_init();
-
-	ret = ip_vs_app_init();
-	if (ret < 0) {
-		IP_VS_ERR("can't setup application helper.\n");
-		goto cleanup_protocol;
-	}
-
-	ret = ip_vs_conn_init();
-	if (ret < 0) {
-		IP_VS_ERR("can't setup connection table.\n");
-		goto cleanup_app;
-	}
-
-	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
-	if (ret < 0) {
-		IP_VS_ERR("can't register hooks.\n");
-		goto cleanup_conn;
-	}
-
-	IP_VS_INFO("ipvs loaded.\n");
-	return ret;
-
-  cleanup_conn:
-	ip_vs_conn_cleanup();
-  cleanup_app:
-	ip_vs_app_cleanup();
-  cleanup_protocol:
-	ip_vs_protocol_cleanup();
-	ip_vs_control_cleanup();
-  cleanup_estimator:
-	ip_vs_estimator_cleanup();
-	return ret;
-}
-
-static void __exit ip_vs_cleanup(void)
-{
-	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
-	ip_vs_conn_cleanup();
-	ip_vs_app_cleanup();
-	ip_vs_protocol_cleanup();
-	ip_vs_control_cleanup();
-	ip_vs_estimator_cleanup();
-	IP_VS_INFO("ipvs unloaded.\n");
-}
-
-module_init(ip_vs_init);
-module_exit(ip_vs_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
deleted file mode 100644
index 0302cf3e503..00000000000
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ /dev/null
@@ -1,3443 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the NetFilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/swap.h>
-#include <linux/seq_file.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/mutex.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#ifdef CONFIG_IP_VS_IPV6
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#endif
-#include <net/route.h>
-#include <net/sock.h>
-#include <net/genetlink.h>
-
-#include <asm/uaccess.h>
-
-#include <net/ip_vs.h>
-
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
-
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
-/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-
-
-#ifdef CONFIG_IP_VS_DEBUG
-static int sysctl_ip_vs_debug_level = 0;
-
-int ip_vs_get_debug_level(void)
-{
-	return sysctl_ip_vs_debug_level;
-}
-#endif
-
-#ifdef CONFIG_IP_VS_IPV6
-/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
-{
-	struct rt6_info *rt;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = *addr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};
-
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
-			return 1;
-
-	return 0;
-}
-#endif
-/*
- *	update_defense_level is called from keventd and from sysctl,
- *	so it needs to protect itself from softirqs
- */
-static void update_defense_level(void)
-{
-	struct sysinfo i;
-	static int old_secure_tcp = 0;
-	int availmem;
-	int nomem;
-	int to_change = -1;
-
-	/* we only count free and buffered memory (in pages) */
-	si_meminfo(&i);
-	availmem = i.freeram + i.bufferram;
-	/* however in linux 2.5 the i.bufferram is total page cache size,
-	   we need adjust it */
-	/* si_swapinfo(&i); */
-	/* availmem = availmem - (i.totalswap - i.freeswap); */
-
-	nomem = (availmem < sysctl_ip_vs_amemthresh);
-
-	local_bh_disable();
-
-	/* drop_entry */
-	spin_lock(&__ip_vs_dropentry_lock);
-	switch (sysctl_ip_vs_drop_entry) {
-	case 0:
-		atomic_set(&ip_vs_dropentry, 0);
-		break;
-	case 1:
-		if (nomem) {
-			atomic_set(&ip_vs_dropentry, 1);
-			sysctl_ip_vs_drop_entry = 2;
-		} else {
-			atomic_set(&ip_vs_dropentry, 0);
-		}
-		break;
-	case 2:
-		if (nomem) {
-			atomic_set(&ip_vs_dropentry, 1);
-		} else {
-			atomic_set(&ip_vs_dropentry, 0);
-			sysctl_ip_vs_drop_entry = 1;
-		};
-		break;
-	case 3:
-		atomic_set(&ip_vs_dropentry, 1);
-		break;
-	}
-	spin_unlock(&__ip_vs_dropentry_lock);
-
-	/* drop_packet */
-	spin_lock(&__ip_vs_droppacket_lock);
-	switch (sysctl_ip_vs_drop_packet) {
-	case 0:
-		ip_vs_drop_rate = 0;
-		break;
-	case 1:
-		if (nomem) {
-			ip_vs_drop_rate = ip_vs_drop_counter
-				= sysctl_ip_vs_amemthresh /
-				(sysctl_ip_vs_amemthresh-availmem);
-			sysctl_ip_vs_drop_packet = 2;
-		} else {
-			ip_vs_drop_rate = 0;
-		}
-		break;
-	case 2:
-		if (nomem) {
-			ip_vs_drop_rate = ip_vs_drop_counter
-				= sysctl_ip_vs_amemthresh /
-				(sysctl_ip_vs_amemthresh-availmem);
-		} else {
-			ip_vs_drop_rate = 0;
-			sysctl_ip_vs_drop_packet = 1;
-		}
-		break;
-	case 3:
-		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
-		break;
-	}
-	spin_unlock(&__ip_vs_droppacket_lock);
-
-	/* secure_tcp */
-	write_lock(&__ip_vs_securetcp_lock);
-	switch (sysctl_ip_vs_secure_tcp) {
-	case 0:
-		if (old_secure_tcp >= 2)
-			to_change = 0;
-		break;
-	case 1:
-		if (nomem) {
-			if (old_secure_tcp < 2)
-				to_change = 1;
-			sysctl_ip_vs_secure_tcp = 2;
-		} else {
-			if (old_secure_tcp >= 2)
-				to_change = 0;
-		}
-		break;
-	case 2:
-		if (nomem) {
-			if (old_secure_tcp < 2)
-				to_change = 1;
-		} else {
-			if (old_secure_tcp >= 2)
-				to_change = 0;
-			sysctl_ip_vs_secure_tcp = 1;
-		}
-		break;
-	case 3:
-		if (old_secure_tcp < 2)
-			to_change = 1;
-		break;
-	}
-	old_secure_tcp = sysctl_ip_vs_secure_tcp;
-	if (to_change >= 0)
-		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
-	write_unlock(&__ip_vs_securetcp_lock);
-
-	local_bh_enable();
-}
-
-
-/*
- *	Timer for checking the defense
- */
-#define DEFENSE_TIMER_PERIOD	1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
-
-static void defense_work_handler(struct work_struct *work)
-{
-	update_defense_level();
-	if (atomic_read(&ip_vs_dropentry))
-		ip_vs_random_dropentry();
-
-	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-}
-
-int
-ip_vs_use_count_inc(void)
-{
-	return try_module_get(THIS_MODULE);
-}
-
-void
-ip_vs_use_count_dec(void)
-{
-	module_put(THIS_MODULE);
-}
-
-
-/*
- *	Hash table: for virtual service lookups
- */
-#define IP_VS_SVC_TAB_BITS 8
-#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
-#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
-
-/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
-/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-
-/*
- *	Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- *	Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- *	FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
-
-/*
- *	Returns hash value for virtual service
- */
-static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
-		  __be16 port)
-{
-	register unsigned porth = ntohs(port);
-	__be32 addr_fold = addr->ip;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		addr_fold = addr->ip6[0]^addr->ip6[1]^
-			    addr->ip6[2]^addr->ip6[3];
-#endif
-
-	return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
-		& IP_VS_SVC_TAB_MASK;
-}
-
-/*
- *	Returns hash value of fwmark for virtual service lookup
- */
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
-{
-	return fwmark & IP_VS_SVC_TAB_MASK;
-}
-
-/*
- *	Hashes a service in the ip_vs_svc_table by <proto,addr,port>
- *	or in the ip_vs_svc_fwm_table by fwmark.
- *	Should be called with locked tables.
- */
-static int ip_vs_svc_hash(struct ip_vs_service *svc)
-{
-	unsigned hash;
-
-	if (svc->flags & IP_VS_SVC_F_HASHED) {
-		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
-
-	if (svc->fwmark == 0) {
-		/*
-		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
-		 */
-		hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
-					 svc->port);
-		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
-	} else {
-		/*
-		 *  Hash it by fwmark in ip_vs_svc_fwm_table
-		 */
-		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
-		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
-	}
-
-	svc->flags |= IP_VS_SVC_F_HASHED;
-	/* increase its refcnt because it is referenced by the svc table */
-	atomic_inc(&svc->refcnt);
-	return 1;
-}
-
-
-/*
- *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
- *	Should be called with locked tables.
- */
-static int ip_vs_svc_unhash(struct ip_vs_service *svc)
-{
-	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
-		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
-
-	if (svc->fwmark == 0) {
-		/* Remove it from the ip_vs_svc_table table */
-		list_del(&svc->s_list);
-	} else {
-		/* Remove it from the ip_vs_svc_fwm_table table */
-		list_del(&svc->f_list);
-	}
-
-	svc->flags &= ~IP_VS_SVC_F_HASHED;
-	atomic_dec(&svc->refcnt);
-	return 1;
-}
-
-
-/*
- *	Get service by {proto,addr,port} in the service table.
- */
-static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
-		    __be16 vport)
-{
-	unsigned hash;
-	struct ip_vs_service *svc;
-
-	/* Check for "full" addressed entries */
-	hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
-
-	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
-		if ((svc->af == af)
-		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
-		    && (svc->port == vport)
-		    && (svc->protocol == protocol)) {
-			/* HIT */
-			atomic_inc(&svc->usecnt);
-			return svc;
-		}
-	}
-
-	return NULL;
-}
-
-
-/*
- *	Get service by {fwmark} in the service table.
- */
-static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
-{
-	unsigned hash;
-	struct ip_vs_service *svc;
-
-	/* Check for fwmark addressed entries */
-	hash = ip_vs_svc_fwm_hashkey(fwmark);
-
-	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
-		if (svc->fwmark == fwmark && svc->af == af) {
-			/* HIT */
-			atomic_inc(&svc->usecnt);
-			return svc;
-		}
-	}
-
-	return NULL;
-}
-
-struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
-		  const union nf_inet_addr *vaddr, __be16 vport)
-{
-	struct ip_vs_service *svc;
-
-	read_lock(&__ip_vs_svc_lock);
-
-	/*
-	 *	Check the table hashed by fwmark first
-	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
-		goto out;
-
-	/*
-	 *	Check the table hashed by <protocol,addr,port>
-	 *	for "full" addressed entries
-	 */
-	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
-
-	if (svc == NULL
-	    && protocol == IPPROTO_TCP
-	    && atomic_read(&ip_vs_ftpsvc_counter)
-	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
-		/*
-		 * Check if ftp service entry exists, the packet
-		 * might belong to FTP data connections.
-		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
-	}
-
-	if (svc == NULL
-	    && atomic_read(&ip_vs_nullsvc_counter)) {
-		/*
-		 * Check if the catch-all port (port zero) exists
-		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
-	}
-
-  out:
-	read_unlock(&__ip_vs_svc_lock);
-
-	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
-		      fwmark, ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
-		      svc ? "hit" : "not hit");
-
-	return svc;
-}
-
-
-static inline void
-__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
-	atomic_inc(&svc->refcnt);
-	dest->svc = svc;
-}
-
-static inline void
-__ip_vs_unbind_svc(struct ip_vs_dest *dest)
-{
-	struct ip_vs_service *svc = dest->svc;
-
-	dest->svc = NULL;
-	if (atomic_dec_and_test(&svc->refcnt))
-		kfree(svc);
-}
-
-
-/*
- *	Returns hash value for real service
- */
-static inline unsigned ip_vs_rs_hashkey(int af,
-					    const union nf_inet_addr *addr,
-					    __be16 port)
-{
-	register unsigned porth = ntohs(port);
-	__be32 addr_fold = addr->ip;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		addr_fold = addr->ip6[0]^addr->ip6[1]^
-			    addr->ip6[2]^addr->ip6[3];
-#endif
-
-	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
-		& IP_VS_RTAB_MASK;
-}
-
-/*
- *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
- *	should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
-{
-	unsigned hash;
-
-	if (!list_empty(&dest->d_list)) {
-		return 0;
-	}
-
-	/*
-	 *	Hash by proto,addr,port,
-	 *	which are the parameters of the real service.
-	 */
-	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
-
-	list_add(&dest->d_list, &ip_vs_rtable[hash]);
-
-	return 1;
-}
-
-/*
- *	UNhashes ip_vs_dest from ip_vs_rtable.
- *	should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
-{
-	/*
-	 * Remove it from the ip_vs_rtable table.
-	 */
-	if (!list_empty(&dest->d_list)) {
-		list_del(&dest->d_list);
-		INIT_LIST_HEAD(&dest->d_list);
-	}
-
-	return 1;
-}
-
-/*
- *	Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
-			  const union nf_inet_addr *daddr,
-			  __be16 dport)
-{
-	unsigned hash;
-	struct ip_vs_dest *dest;
-
-	/*
-	 *	Check for "full" addressed entries
-	 *	Return the first found entry
-	 */
-	hash = ip_vs_rs_hashkey(af, daddr, dport);
-
-	read_lock(&__ip_vs_rs_lock);
-	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
-		if ((dest->af == af)
-		    && ip_vs_addr_equal(af, &dest->addr, daddr)
-		    && (dest->port == dport)
-		    && ((dest->protocol == protocol) ||
-			dest->vfwmark)) {
-			/* HIT */
-			read_unlock(&__ip_vs_rs_lock);
-			return dest;
-		}
-	}
-	read_unlock(&__ip_vs_rs_lock);
-
-	return NULL;
-}
-
-/*
- *	Lookup destination by {addr,port} in the given service
- */
-static struct ip_vs_dest *
-ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
-		  __be16 dport)
-{
-	struct ip_vs_dest *dest;
-
-	/*
-	 * Find the destination for the given service
-	 */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if ((dest->af == svc->af)
-		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
-		    && (dest->port == dport)) {
-			/* HIT */
-			return dest;
-		}
-	}
-
-	return NULL;
-}
-
-/*
- * Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
- * the backup synchronization daemon. It finds the
- * destination to be bound to the received connection
- * on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
- */
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
-				   __be16 dport,
-				   const union nf_inet_addr *vaddr,
-				   __be16 vport, __u16 protocol)
-{
-	struct ip_vs_dest *dest;
-	struct ip_vs_service *svc;
-
-	svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
-	if (!svc)
-		return NULL;
-	dest = ip_vs_lookup_dest(svc, daddr, dport);
-	if (dest)
-		atomic_inc(&dest->refcnt);
-	ip_vs_service_put(svc);
-	return dest;
-}
-
-/*
- *  Lookup dest by {svc,addr,port} in the destination trash.
- *  The destination trash is used to hold the destinations that are removed
- *  from the service table but are still referenced by some conn entries.
- *  The reason to add the destination trash is when the dest is temporary
- *  down (either by administrator or by monitor program), the dest can be
- *  picked back from the trash, the remaining connections to the dest can
- *  continue, and the counting information of the dest is also useful for
- *  scheduling.
- */
-static struct ip_vs_dest *
-ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
-		     __be16 dport)
-{
-	struct ip_vs_dest *dest, *nxt;
-
-	/*
-	 * Find the destination in trash
-	 */
-	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
-		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
-			      "dest->refcnt=%d\n",
-			      dest->vfwmark,
-			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
-			      ntohs(dest->port),
-			      atomic_read(&dest->refcnt));
-		if (dest->af == svc->af &&
-		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
-		    dest->port == dport &&
-		    dest->vfwmark == svc->fwmark &&
-		    dest->protocol == svc->protocol &&
-		    (svc->fwmark ||
-		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
-		      dest->vport == svc->port))) {
-			/* HIT */
-			return dest;
-		}
-
-		/*
-		 * Try to purge the destination from trash if not referenced
-		 */
-		if (atomic_read(&dest->refcnt) == 1) {
-			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
-				      "from trash\n",
-				      dest->vfwmark,
-				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
-				      ntohs(dest->port));
-			list_del(&dest->n_list);
-			ip_vs_dst_reset(dest);
-			__ip_vs_unbind_svc(dest);
-			kfree(dest);
-		}
-	}
-
-	return NULL;
-}
-
-
-/*
- *  Clean up all the destinations in the trash
- *  Called by the ip_vs_control_cleanup()
- *
- *  When the ip_vs_control_clearup is activated by ipvs module exit,
- *  the service tables must have been flushed and all the connections
- *  are expired, and the refcnt of each destination in the trash must
- *  be 1, so we simply release them here.
- */
-static void ip_vs_trash_cleanup(void)
-{
-	struct ip_vs_dest *dest, *nxt;
-
-	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
-		list_del(&dest->n_list);
-		ip_vs_dst_reset(dest);
-		__ip_vs_unbind_svc(dest);
-		kfree(dest);
-	}
-}
-
-
-static void
-ip_vs_zero_stats(struct ip_vs_stats *stats)
-{
-	spin_lock_bh(&stats->lock);
-
-	memset(&stats->ustats, 0, sizeof(stats->ustats));
-	ip_vs_zero_estimator(stats);
-
-	spin_unlock_bh(&stats->lock);
-}
-
-/*
- *	Update a destination in the given service
- */
-static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
-{
-	int conn_flags;
-
-	/* set the weight and the flags */
-	atomic_set(&dest->weight, udest->weight);
-	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
-	/* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6) {
-		if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
-	} else
-#endif
-		if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
-
-	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
-	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
-		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
-	} else {
-		/*
-		 *    Put the real service in ip_vs_rtable if not present.
-		 *    For now only for NAT!
-		 */
-		write_lock_bh(&__ip_vs_rs_lock);
-		ip_vs_rs_hash(dest);
-		write_unlock_bh(&__ip_vs_rs_lock);
-	}
-	atomic_set(&dest->conn_flags, conn_flags);
-
-	/* bind the service */
-	if (!dest->svc) {
-		__ip_vs_bind_svc(dest, svc);
-	} else {
-		if (dest->svc != svc) {
-			__ip_vs_unbind_svc(dest);
-			ip_vs_zero_stats(&dest->stats);
-			__ip_vs_bind_svc(dest, svc);
-		}
-	}
-
-	/* set the dest status flags */
-	dest->flags |= IP_VS_DEST_F_AVAILABLE;
-
-	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
-		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-	dest->u_threshold = udest->u_threshold;
-	dest->l_threshold = udest->l_threshold;
-}
-
-
-/*
- *	Create a destination for the given service
- */
-static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
-	       struct ip_vs_dest **dest_p)
-{
-	struct ip_vs_dest *dest;
-	unsigned atype;
-
-	EnterFunction(2);
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6) {
-		atype = ipv6_addr_type(&udest->addr.in6);
-		if ((!(atype & IPV6_ADDR_UNICAST) ||
-			atype & IPV6_ADDR_LINKLOCAL) &&
-			!__ip_vs_addr_is_local_v6(&udest->addr.in6))
-			return -EINVAL;
-	} else
-#endif
-	{
-		atype = inet_addr_type(&init_net, udest->addr.ip);
-		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
-			return -EINVAL;
-	}
-
-	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
-	if (dest == NULL) {
-		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
-		return -ENOMEM;
-	}
-
-	dest->af = svc->af;
-	dest->protocol = svc->protocol;
-	dest->vaddr = svc->addr;
-	dest->vport = svc->port;
-	dest->vfwmark = svc->fwmark;
-	ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
-	dest->port = udest->port;
-
-	atomic_set(&dest->activeconns, 0);
-	atomic_set(&dest->inactconns, 0);
-	atomic_set(&dest->persistconns, 0);
-	atomic_set(&dest->refcnt, 0);
-
-	INIT_LIST_HEAD(&dest->d_list);
-	spin_lock_init(&dest->dst_lock);
-	spin_lock_init(&dest->stats.lock);
-	__ip_vs_update_dest(svc, dest, udest);
-	ip_vs_new_estimator(&dest->stats);
-
-	*dest_p = dest;
-
-	LeaveFunction(2);
-	return 0;
-}
-
-
-/*
- *	Add a destination into an existing service
- */
-static int
-ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
-{
-	struct ip_vs_dest *dest;
-	union nf_inet_addr daddr;
-	__be16 dport = udest->port;
-	int ret;
-
-	EnterFunction(2);
-
-	if (udest->weight < 0) {
-		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
-		return -ERANGE;
-	}
-
-	if (udest->l_threshold > udest->u_threshold) {
-		IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
-			  "upper threshold\n");
-		return -ERANGE;
-	}
-
-	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
-
-	/*
-	 * Check if the dest already exists in the list
-	 */
-	dest = ip_vs_lookup_dest(svc, &daddr, dport);
-
-	if (dest != NULL) {
-		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
-		return -EEXIST;
-	}
-
-	/*
-	 * Check if the dest already exists in the trash and
-	 * is from the same service
-	 */
-	dest = ip_vs_trash_get_dest(svc, &daddr, dport);
-
-	if (dest != NULL) {
-		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
-			      "dest->refcnt=%d, service %u/%s:%u\n",
-			      IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
-			      atomic_read(&dest->refcnt),
-			      dest->vfwmark,
-			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
-			      ntohs(dest->vport));
-
-		__ip_vs_update_dest(svc, dest, udest);
-
-		/*
-		 * Get the destination from the trash
-		 */
-		list_del(&dest->n_list);
-
-		ip_vs_new_estimator(&dest->stats);
-
-		write_lock_bh(&__ip_vs_svc_lock);
-
-		/*
-		 * Wait until all other svc users go away.
-		 */
-		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-		list_add(&dest->n_list, &svc->destinations);
-		svc->num_dests++;
-
-		/* call the update_service function of its scheduler */
-		if (svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
-
-		write_unlock_bh(&__ip_vs_svc_lock);
-		return 0;
-	}
-
-	/*
-	 * Allocate and initialize the dest structure
-	 */
-	ret = ip_vs_new_dest(svc, udest, &dest);
-	if (ret) {
-		return ret;
-	}
-
-	/*
-	 * Add the dest entry into the list
-	 */
-	atomic_inc(&dest->refcnt);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	list_add(&dest->n_list, &svc->destinations);
-	svc->num_dests++;
-
-	/* call the update_service function of its scheduler */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
-	LeaveFunction(2);
-
-	return 0;
-}
-
-
-/*
- *	Edit a destination in the given service
- */
-static int
-ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
-{
-	struct ip_vs_dest *dest;
-	union nf_inet_addr daddr;
-	__be16 dport = udest->port;
-
-	EnterFunction(2);
-
-	if (udest->weight < 0) {
-		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
-		return -ERANGE;
-	}
-
-	if (udest->l_threshold > udest->u_threshold) {
-		IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
-			  "upper threshold\n");
-		return -ERANGE;
-	}
-
-	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
-
-	/*
-	 *  Lookup the destination list
-	 */
-	dest = ip_vs_lookup_dest(svc, &daddr, dport);
-
-	if (dest == NULL) {
-		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
-		return -ENOENT;
-	}
-
-	__ip_vs_update_dest(svc, dest, udest);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/* Wait until all other svc users go away */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/* call the update_service, because server weight may be changed */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
-	LeaveFunction(2);
-
-	return 0;
-}
-
-
-/*
- *	Delete a destination (must be already unlinked from the service)
- */
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
-{
-	ip_vs_kill_estimator(&dest->stats);
-
-	/*
-	 *  Remove it from the d-linked list with the real services.
-	 */
-	write_lock_bh(&__ip_vs_rs_lock);
-	ip_vs_rs_unhash(dest);
-	write_unlock_bh(&__ip_vs_rs_lock);
-
-	/*
-	 *  Decrease the refcnt of the dest, and free the dest
-	 *  if nobody refers to it (refcnt=0). Otherwise, throw
-	 *  the destination into the trash.
-	 */
-	if (atomic_dec_and_test(&dest->refcnt)) {
-		ip_vs_dst_reset(dest);
-		/* simply decrease svc->refcnt here, let the caller check
-		   and release the service if nobody refers to it.
-		   Only user context can release destination and service,
-		   and only one user context can update virtual service at a
-		   time, so the operation here is OK */
-		atomic_dec(&dest->svc->refcnt);
-		kfree(dest);
-	} else {
-		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
-			      "dest->refcnt=%d\n",
-			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
-			      ntohs(dest->port),
-			      atomic_read(&dest->refcnt));
-		list_add(&dest->n_list, &ip_vs_dest_trash);
-		atomic_inc(&dest->refcnt);
-	}
-}
-
-
-/*
- *	Unlink a destination from the given service
- */
-static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
-				struct ip_vs_dest *dest,
-				int svcupd)
-{
-	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
-
-	/*
-	 *  Remove it from the d-linked destination list.
-	 */
-	list_del(&dest->n_list);
-	svc->num_dests--;
-
-	/*
-	 *  Call the update_service function of its scheduler
-	 */
-	if (svcupd && svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
-}
-
-
-/*
- *	Delete a destination server in the given service
- */
-static int
-ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
-{
-	struct ip_vs_dest *dest;
-	__be16 dport = udest->port;
-
-	EnterFunction(2);
-
-	dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
-
-	if (dest == NULL) {
-		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
-		return -ENOENT;
-	}
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 *	Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/*
-	 *	Unlink dest from the service
-	 */
-	__ip_vs_unlink_dest(svc, dest, 1);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 *	Delete the destination
-	 */
-	__ip_vs_del_dest(dest);
-
-	LeaveFunction(2);
-
-	return 0;
-}
-
-
-/*
- *	Add a service into the service hash table
- */
-static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
-		  struct ip_vs_service **svc_p)
-{
-	int ret = 0;
-	struct ip_vs_scheduler *sched = NULL;
-	struct ip_vs_service *svc = NULL;
-
-	/* increase the module use count */
-	ip_vs_use_count_inc();
-
-	/* Lookup the scheduler by 'u->sched_name' */
-	sched = ip_vs_scheduler_get(u->sched_name);
-	if (sched == NULL) {
-		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
-			   u->sched_name);
-		ret = -ENOENT;
-		goto out_mod_dec;
-	}
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (u->af == AF_INET6) {
-		if (!sched->supports_ipv6) {
-			ret = -EAFNOSUPPORT;
-			goto out_err;
-		}
-		if ((u->netmask < 1) || (u->netmask > 128)) {
-			ret = -EINVAL;
-			goto out_err;
-		}
-	}
-#endif
-
-	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
-	if (svc == NULL) {
-		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
-		ret = -ENOMEM;
-		goto out_err;
-	}
-
-	/* I'm the first user of the service */
-	atomic_set(&svc->usecnt, 1);
-	atomic_set(&svc->refcnt, 0);
-
-	svc->af = u->af;
-	svc->protocol = u->protocol;
-	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
-	svc->port = u->port;
-	svc->fwmark = u->fwmark;
-	svc->flags = u->flags;
-	svc->timeout = u->timeout * HZ;
-	svc->netmask = u->netmask;
-
-	INIT_LIST_HEAD(&svc->destinations);
-	rwlock_init(&svc->sched_lock);
-	spin_lock_init(&svc->stats.lock);
-
-	/* Bind the scheduler */
-	ret = ip_vs_bind_scheduler(svc, sched);
-	if (ret)
-		goto out_err;
-	sched = NULL;
-
-	/* Update the virtual service counters */
-	if (svc->port == FTPPORT)
-		atomic_inc(&ip_vs_ftpsvc_counter);
-	else if (svc->port == 0)
-		atomic_inc(&ip_vs_nullsvc_counter);
-
-	ip_vs_new_estimator(&svc->stats);
-
-	/* Count only IPv4 services for old get/setsockopt interface */
-	if (svc->af == AF_INET)
-		ip_vs_num_services++;
-
-	/* Hash the service into the service table */
-	write_lock_bh(&__ip_vs_svc_lock);
-	ip_vs_svc_hash(svc);
-	write_unlock_bh(&__ip_vs_svc_lock);
-
-	*svc_p = svc;
-	return 0;
-
-  out_err:
-	if (svc != NULL) {
-		if (svc->scheduler)
-			ip_vs_unbind_scheduler(svc);
-		if (svc->inc) {
-			local_bh_disable();
-			ip_vs_app_inc_put(svc->inc);
-			local_bh_enable();
-		}
-		kfree(svc);
-	}
-	ip_vs_scheduler_put(sched);
-
-  out_mod_dec:
-	/* decrease the module use count */
-	ip_vs_use_count_dec();
-
-	return ret;
-}
-
-
-/*
- *	Edit a service and bind it with a new scheduler
- */
-static int
-ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
-{
-	struct ip_vs_scheduler *sched, *old_sched;
-	int ret = 0;
-
-	/*
-	 * Lookup the scheduler, by 'u->sched_name'
-	 */
-	sched = ip_vs_scheduler_get(u->sched_name);
-	if (sched == NULL) {
-		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
-			   u->sched_name);
-		return -ENOENT;
-	}
-	old_sched = sched;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (u->af == AF_INET6) {
-		if (!sched->supports_ipv6) {
-			ret = -EAFNOSUPPORT;
-			goto out;
-		}
-		if ((u->netmask < 1) || (u->netmask > 128)) {
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-#endif
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/*
-	 * Set the flags and timeout value
-	 */
-	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
-	svc->timeout = u->timeout * HZ;
-	svc->netmask = u->netmask;
-
-	old_sched = svc->scheduler;
-	if (sched != old_sched) {
-		/*
-		 * Unbind the old scheduler
-		 */
-		if ((ret = ip_vs_unbind_scheduler(svc))) {
-			old_sched = sched;
-			goto out_unlock;
-		}
-
-		/*
-		 * Bind the new scheduler
-		 */
-		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
-			/*
-			 * If ip_vs_bind_scheduler fails, restore the old
-			 * scheduler.
-			 * The main reason of failure is out of memory.
-			 *
-			 * The question is if the old scheduler can be
-			 * restored all the time. TODO: if it cannot be
-			 * restored some time, we must delete the service,
-			 * otherwise the system may crash.
-			 */
-			ip_vs_bind_scheduler(svc, old_sched);
-			old_sched = sched;
-			goto out_unlock;
-		}
-	}
-
-  out_unlock:
-	write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
-  out:
-#endif
-
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
-
-	return ret;
-}
-
-
-/*
- *	Delete a service from the service list
- *	- The service must be unlinked, unlocked and not referenced!
- *	- We are called under _bh lock
- */
-static void __ip_vs_del_service(struct ip_vs_service *svc)
-{
-	struct ip_vs_dest *dest, *nxt;
-	struct ip_vs_scheduler *old_sched;
-
-	/* Count only IPv4 services for old get/setsockopt interface */
-	if (svc->af == AF_INET)
-		ip_vs_num_services--;
-
-	ip_vs_kill_estimator(&svc->stats);
-
-	/* Unbind scheduler */
-	old_sched = svc->scheduler;
-	ip_vs_unbind_scheduler(svc);
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
-
-	/* Unbind app inc */
-	if (svc->inc) {
-		ip_vs_app_inc_put(svc->inc);
-		svc->inc = NULL;
-	}
-
-	/*
-	 *    Unlink the whole destination list
-	 */
-	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
-		__ip_vs_unlink_dest(svc, dest, 0);
-		__ip_vs_del_dest(dest);
-	}
-
-	/*
-	 *    Update the virtual service counters
-	 */
-	if (svc->port == FTPPORT)
-		atomic_dec(&ip_vs_ftpsvc_counter);
-	else if (svc->port == 0)
-		atomic_dec(&ip_vs_nullsvc_counter);
-
-	/*
-	 *    Free the service if nobody refers to it
-	 */
-	if (atomic_read(&svc->refcnt) == 0)
-		kfree(svc);
-
-	/* decrease the module use count */
-	ip_vs_use_count_dec();
-}
-
-/*
- *	Delete a service from the service list
- */
-static int ip_vs_del_service(struct ip_vs_service *svc)
-{
-	if (svc == NULL)
-		return -EEXIST;
-
-	/*
-	 * Unhash it from the service table
-	 */
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	ip_vs_svc_unhash(svc);
-
-	/*
-	 * Wait until all the svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	__ip_vs_del_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
-	return 0;
-}
-
-
-/*
- *	Flush all the virtual services
- */
-static int ip_vs_flush(void)
-{
-	int idx;
-	struct ip_vs_service *svc, *nxt;
-
-	/*
-	 * Flush the service table hashed by <protocol,addr,port>
-	 */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
-		}
-	}
-
-	/*
-	 * Flush the service table hashed by fwmark
-	 */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry_safe(svc, nxt,
-					 &ip_vs_svc_fwm_table[idx], f_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
-		}
-	}
-
-	return 0;
-}
-
-
-/*
- *	Zero counters in a service or all services
- */
-static int ip_vs_zero_service(struct ip_vs_service *svc)
-{
-	struct ip_vs_dest *dest;
-
-	write_lock_bh(&__ip_vs_svc_lock);
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		ip_vs_zero_stats(&dest->stats);
-	}
-	ip_vs_zero_stats(&svc->stats);
-	write_unlock_bh(&__ip_vs_svc_lock);
-	return 0;
-}
-
-static int ip_vs_zero_all(void)
-{
-	int idx;
-	struct ip_vs_service *svc;
-
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
-			ip_vs_zero_service(svc);
-		}
-	}
-
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
-			ip_vs_zero_service(svc);
-		}
-	}
-
-	ip_vs_zero_stats(&ip_vs_stats);
-	return 0;
-}
-
-
-static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
-		     void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int *valp = table->data;
-	int val = *valp;
-	int rc;
-
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
-	if (write && (*valp != val)) {
-		if ((*valp < 0) || (*valp > 3)) {
-			/* Restore the correct value */
-			*valp = val;
-		} else {
-			update_defense_level();
-		}
-	}
-	return rc;
-}
-
-
-static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
-		       void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int *valp = table->data;
-	int val[2];
-	int rc;
-
-	/* backup the value first */
-	memcpy(val, valp, sizeof(val));
-
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
-	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
-		/* Restore the correct value */
-		memcpy(valp, val, sizeof(val));
-	}
-	return rc;
-}
-
-
-/*
- *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
- */
-
-static struct ctl_table vs_vars[] = {
-	{
-		.procname	= "amemthresh",
-		.data		= &sysctl_ip_vs_amemthresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-#ifdef CONFIG_IP_VS_DEBUG
-	{
-		.procname	= "debug_level",
-		.data		= &sysctl_ip_vs_debug_level,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-#endif
-	{
-		.procname	= "am_droprate",
-		.data		= &sysctl_ip_vs_am_droprate,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.procname	= "drop_entry",
-		.data		= &sysctl_ip_vs_drop_entry,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_do_defense_mode,
-	},
-	{
-		.procname	= "drop_packet",
-		.data		= &sysctl_ip_vs_drop_packet,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_do_defense_mode,
-	},
-	{
-		.procname	= "secure_tcp",
-		.data		= &sysctl_ip_vs_secure_tcp,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_do_defense_mode,
-	},
-#if 0
-	{
-		.procname	= "timeout_established",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_synsent",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_synrecv",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_finwait",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_timewait",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_close",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_closewait",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_lastack",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_listen",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_synack",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_udp",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "timeout_icmp",
-		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-#endif
-	{
-		.procname	= "cache_bypass",
-		.data		= &sysctl_ip_vs_cache_bypass,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.procname	= "expire_nodest_conn",
-		.data		= &sysctl_ip_vs_expire_nodest_conn,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.procname	= "expire_quiescent_template",
-		.data		= &sysctl_ip_vs_expire_quiescent_template,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.procname	= "sync_threshold",
-		.data		= &sysctl_ip_vs_sync_threshold,
-		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
-		.mode		= 0644,
-		.proc_handler	= &proc_do_sync_threshold,
-	},
-	{
-		.procname	= "nat_icmp_send",
-		.data		= &sysctl_ip_vs_nat_icmp_send,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{ .ctl_name = 0 }
-};
-
-const struct ctl_path net_vs_ctl_path[] = {
-	{ .procname = "net", .ctl_name = CTL_NET, },
-	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
-	{ .procname = "vs", },
-	{ }
-};
-EXPORT_SYMBOL_GPL(net_vs_ctl_path);
-
-static struct ctl_table_header * sysctl_header;
-
-#ifdef CONFIG_PROC_FS
-
-struct ip_vs_iter {
-	struct list_head *table;
-	int bucket;
-};
-
-/*
- *	Write the contents of the VS rule table to a PROCfs file.
- *	(It is kept just for backward compatibility)
- */
-static inline const char *ip_vs_fwd_name(unsigned flags)
-{
-	switch (flags & IP_VS_CONN_F_FWD_MASK) {
-	case IP_VS_CONN_F_LOCALNODE:
-		return "Local";
-	case IP_VS_CONN_F_TUNNEL:
-		return "Tunnel";
-	case IP_VS_CONN_F_DROUTE:
-		return "Route";
-	default:
-		return "Masq";
-	}
-}
-
-
-/* Get the Nth entry in the two lists */
-static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
-{
-	struct ip_vs_iter *iter = seq->private;
-	int idx;
-	struct ip_vs_service *svc;
-
-	/* look in hash by protocol */
-	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
-			if (pos-- == 0){
-				iter->table = ip_vs_svc_table;
-				iter->bucket = idx;
-				return svc;
-			}
-		}
-	}
-
-	/* keep looking in fwmark */
-	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
-			if (pos-- == 0) {
-				iter->table = ip_vs_svc_fwm_table;
-				iter->bucket = idx;
-				return svc;
-			}
-		}
-	}
-
-	return NULL;
-}
-
-static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
-{
-
-	read_lock_bh(&__ip_vs_svc_lock);
-	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
-}
-
-
-static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct list_head *e;
-	struct ip_vs_iter *iter;
-	struct ip_vs_service *svc;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ip_vs_info_array(seq,0);
-
-	svc = v;
-	iter = seq->private;
-
-	if (iter->table == ip_vs_svc_table) {
-		/* next service in table hashed by protocol */
-		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
-			return list_entry(e, struct ip_vs_service, s_list);
-
-
-		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
-			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
-					    s_list) {
-				return svc;
-			}
-		}
-
-		iter->table = ip_vs_svc_fwm_table;
-		iter->bucket = -1;
-		goto scan_fwmark;
-	}
-
-	/* next service in hashed by fwmark */
-	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
-		return list_entry(e, struct ip_vs_service, f_list);
-
- scan_fwmark:
-	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
-				    f_list)
-			return svc;
-	}
-
-	return NULL;
-}
-
-static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
-{
-	read_unlock_bh(&__ip_vs_svc_lock);
-}
-
-
-static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
-{
-	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq,
-			"IP Virtual Server version %d.%d.%d (size=%d)\n",
-			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
-		seq_puts(seq,
-			 "Prot LocalAddress:Port Scheduler Flags\n");
-		seq_puts(seq,
-			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
-	} else {
-		const struct ip_vs_service *svc = v;
-		const struct ip_vs_iter *iter = seq->private;
-		const struct ip_vs_dest *dest;
-
-		if (iter->table == ip_vs_svc_table) {
-#ifdef CONFIG_IP_VS_IPV6
-			if (svc->af == AF_INET6)
-				seq_printf(seq, "%s  [" NIP6_FMT "]:%04X %s ",
-					   ip_vs_proto_name(svc->protocol),
-					   NIP6(svc->addr.in6),
-					   ntohs(svc->port),
-					   svc->scheduler->name);
-			else
-#endif
-				seq_printf(seq, "%s  %08X:%04X %s ",
-					   ip_vs_proto_name(svc->protocol),
-					   ntohl(svc->addr.ip),
-					   ntohs(svc->port),
-					   svc->scheduler->name);
-		} else {
-			seq_printf(seq, "FWM  %08X %s ",
-				   svc->fwmark, svc->scheduler->name);
-		}
-
-		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
-			seq_printf(seq, "persistent %d %08X\n",
-				svc->timeout,
-				ntohl(svc->netmask));
-		else
-			seq_putc(seq, '\n');
-
-		list_for_each_entry(dest, &svc->destinations, n_list) {
-#ifdef CONFIG_IP_VS_IPV6
-			if (dest->af == AF_INET6)
-				seq_printf(seq,
-					   "  -> [" NIP6_FMT "]:%04X"
-					   "      %-7s %-6d %-10d %-10d\n",
-					   NIP6(dest->addr.in6),
-					   ntohs(dest->port),
-					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
-					   atomic_read(&dest->weight),
-					   atomic_read(&dest->activeconns),
-					   atomic_read(&dest->inactconns));
-			else
-#endif
-				seq_printf(seq,
-					   "  -> %08X:%04X      "
-					   "%-7s %-6d %-10d %-10d\n",
-					   ntohl(dest->addr.ip),
-					   ntohs(dest->port),
-					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
-					   atomic_read(&dest->weight),
-					   atomic_read(&dest->activeconns),
-					   atomic_read(&dest->inactconns));
-
-		}
-	}
-	return 0;
-}
-
-static const struct seq_operations ip_vs_info_seq_ops = {
-	.start = ip_vs_info_seq_start,
-	.next  = ip_vs_info_seq_next,
-	.stop  = ip_vs_info_seq_stop,
-	.show  = ip_vs_info_seq_show,
-};
-
-static int ip_vs_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open_private(file, &ip_vs_info_seq_ops,
-			sizeof(struct ip_vs_iter));
-}
-
-static const struct file_operations ip_vs_info_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = ip_vs_info_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_private,
-};
-
-#endif
-
-struct ip_vs_stats ip_vs_stats = {
-	.lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
-};
-
-#ifdef CONFIG_PROC_FS
-static int ip_vs_stats_show(struct seq_file *seq, void *v)
-{
-
-/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
-	seq_puts(seq,
-		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
-	seq_printf(seq,
-		   "   Conns  Packets  Packets            Bytes            Bytes\n");
-
-	spin_lock_bh(&ip_vs_stats.lock);
-	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
-		   ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
-		   (unsigned long long) ip_vs_stats.ustats.inbytes,
-		   (unsigned long long) ip_vs_stats.ustats.outbytes);
-
-/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
-	seq_puts(seq,
-		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
-	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
-			ip_vs_stats.ustats.cps,
-			ip_vs_stats.ustats.inpps,
-			ip_vs_stats.ustats.outpps,
-			ip_vs_stats.ustats.inbps,
-			ip_vs_stats.ustats.outbps);
-	spin_unlock_bh(&ip_vs_stats.lock);
-
-	return 0;
-}
-
-static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ip_vs_stats_show, NULL);
-}
-
-static const struct file_operations ip_vs_stats_fops = {
-	.owner = THIS_MODULE,
-	.open = ip_vs_stats_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-#endif
-
-/*
- *	Set timeout values for tcp tcpfin udp in the timeout_table.
- */
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
-{
-	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
-		  u->tcp_timeout,
-		  u->tcp_fin_timeout,
-		  u->udp_timeout);
-
-#ifdef CONFIG_IP_VS_PROTO_TCP
-	if (u->tcp_timeout) {
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
-			= u->tcp_timeout * HZ;
-	}
-
-	if (u->tcp_fin_timeout) {
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
-			= u->tcp_fin_timeout * HZ;
-	}
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_UDP
-	if (u->udp_timeout) {
-		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
-			= u->udp_timeout * HZ;
-	}
-#endif
-	return 0;
-}
-
-
-#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
-#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
-#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
-				 sizeof(struct ip_vs_dest_user))
-#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
-#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
-#define MAX_ARG_LEN		SVCDEST_ARG_LEN
-
-static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
-	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
-	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
-	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
-};
-
-static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
-				  struct ip_vs_service_user *usvc_compat)
-{
-	usvc->af		= AF_INET;
-	usvc->protocol		= usvc_compat->protocol;
-	usvc->addr.ip		= usvc_compat->addr;
-	usvc->port		= usvc_compat->port;
-	usvc->fwmark		= usvc_compat->fwmark;
-
-	/* Deep copy of sched_name is not needed here */
-	usvc->sched_name	= usvc_compat->sched_name;
-
-	usvc->flags		= usvc_compat->flags;
-	usvc->timeout		= usvc_compat->timeout;
-	usvc->netmask		= usvc_compat->netmask;
-}
-
-static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
-				   struct ip_vs_dest_user *udest_compat)
-{
-	udest->addr.ip		= udest_compat->addr;
-	udest->port		= udest_compat->port;
-	udest->conn_flags	= udest_compat->conn_flags;
-	udest->weight		= udest_compat->weight;
-	udest->u_threshold	= udest_compat->u_threshold;
-	udest->l_threshold	= udest_compat->l_threshold;
-}
-
-static int
-do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
-{
-	int ret;
-	unsigned char arg[MAX_ARG_LEN];
-	struct ip_vs_service_user *usvc_compat;
-	struct ip_vs_service_user_kern usvc;
-	struct ip_vs_service *svc;
-	struct ip_vs_dest_user *udest_compat;
-	struct ip_vs_dest_user_kern udest;
-
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (len != set_arglen[SET_CMDID(cmd)]) {
-		IP_VS_ERR("set_ctl: len %u != %u\n",
-			  len, set_arglen[SET_CMDID(cmd)]);
-		return -EINVAL;
-	}
-
-	if (copy_from_user(arg, user, len) != 0)
-		return -EFAULT;
-
-	/* increase the module use count */
-	ip_vs_use_count_inc();
-
-	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
-		ret = -ERESTARTSYS;
-		goto out_dec;
-	}
-
-	if (cmd == IP_VS_SO_SET_FLUSH) {
-		/* Flush the virtual service */
-		ret = ip_vs_flush();
-		goto out_unlock;
-	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
-		/* Set timeout values for (tcp tcpfin udp) */
-		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
-		goto out_unlock;
-	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
-		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
-		goto out_unlock;
-	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
-		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-		ret = stop_sync_thread(dm->state);
-		goto out_unlock;
-	}
-
-	usvc_compat = (struct ip_vs_service_user *)arg;
-	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
-
-	/* We only use the new structs internally, so copy userspace compat
-	 * structs to extended internal versions */
-	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
-	ip_vs_copy_udest_compat(&udest, udest_compat);
-
-	if (cmd == IP_VS_SO_SET_ZERO) {
-		/* if no service address is set, zero counters in all */
-		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
-			ret = ip_vs_zero_all();
-			goto out_unlock;
-		}
-	}
-
-	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
-	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
-		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
-			  usvc.protocol, NIPQUAD(usvc.addr.ip),
-			  ntohs(usvc.port), usvc.sched_name);
-		ret = -EFAULT;
-		goto out_unlock;
-	}
-
-	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
-	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
-	if (cmd != IP_VS_SO_SET_ADD
-	    && (svc == NULL || svc->protocol != usvc.protocol)) {
-		ret = -ESRCH;
-		goto out_unlock;
-	}
-
-	switch (cmd) {
-	case IP_VS_SO_SET_ADD:
-		if (svc != NULL)
-			ret = -EEXIST;
-		else
-			ret = ip_vs_add_service(&usvc, &svc);
-		break;
-	case IP_VS_SO_SET_EDIT:
-		ret = ip_vs_edit_service(svc, &usvc);
-		break;
-	case IP_VS_SO_SET_DEL:
-		ret = ip_vs_del_service(svc);
-		if (!ret)
-			goto out_unlock;
-		break;
-	case IP_VS_SO_SET_ZERO:
-		ret = ip_vs_zero_service(svc);
-		break;
-	case IP_VS_SO_SET_ADDDEST:
-		ret = ip_vs_add_dest(svc, &udest);
-		break;
-	case IP_VS_SO_SET_EDITDEST:
-		ret = ip_vs_edit_dest(svc, &udest);
-		break;
-	case IP_VS_SO_SET_DELDEST:
-		ret = ip_vs_del_dest(svc, &udest);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-	if (svc)
-		ip_vs_service_put(svc);
-
-  out_unlock:
-	mutex_unlock(&__ip_vs_mutex);
-  out_dec:
-	/* decrease the module use count */
-	ip_vs_use_count_dec();
-
-	return ret;
-}
-
-
-static void
-ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
-{
-	spin_lock_bh(&src->lock);
-	memcpy(dst, &src->ustats, sizeof(*dst));
-	spin_unlock_bh(&src->lock);
-}
-
-static void
-ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
-{
-	dst->protocol = src->protocol;
-	dst->addr = src->addr.ip;
-	dst->port = src->port;
-	dst->fwmark = src->fwmark;
-	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
-	dst->flags = src->flags;
-	dst->timeout = src->timeout / HZ;
-	dst->netmask = src->netmask;
-	dst->num_dests = src->num_dests;
-	ip_vs_copy_stats(&dst->stats, &src->stats);
-}
-
-static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
-			    struct ip_vs_get_services __user *uptr)
-{
-	int idx, count=0;
-	struct ip_vs_service *svc;
-	struct ip_vs_service_entry entry;
-	int ret = 0;
-
-	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
-			/* Only expose IPv4 entries to old interface */
-			if (svc->af != AF_INET)
-				continue;
-
-			if (count >= get->num_services)
-				goto out;
-			memset(&entry, 0, sizeof(entry));
-			ip_vs_copy_service(&entry, svc);
-			if (copy_to_user(&uptr->entrytable[count],
-					 &entry, sizeof(entry))) {
-				ret = -EFAULT;
-				goto out;
-			}
-			count++;
-		}
-	}
-
-	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
-			/* Only expose IPv4 entries to old interface */
-			if (svc->af != AF_INET)
-				continue;
-
-			if (count >= get->num_services)
-				goto out;
-			memset(&entry, 0, sizeof(entry));
-			ip_vs_copy_service(&entry, svc);
-			if (copy_to_user(&uptr->entrytable[count],
-					 &entry, sizeof(entry))) {
-				ret = -EFAULT;
-				goto out;
-			}
-			count++;
-		}
-	}
-  out:
-	return ret;
-}
-
-static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
-			 struct ip_vs_get_dests __user *uptr)
-{
-	struct ip_vs_service *svc;
-	union nf_inet_addr addr = { .ip = get->addr };
-	int ret = 0;
-
-	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
-	else
-		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-					  get->port);
-
-	if (svc) {
-		int count = 0;
-		struct ip_vs_dest *dest;
-		struct ip_vs_dest_entry entry;
-
-		list_for_each_entry(dest, &svc->destinations, n_list) {
-			if (count >= get->num_dests)
-				break;
-
-			entry.addr = dest->addr.ip;
-			entry.port = dest->port;
-			entry.conn_flags = atomic_read(&dest->conn_flags);
-			entry.weight = atomic_read(&dest->weight);
-			entry.u_threshold = dest->u_threshold;
-			entry.l_threshold = dest->l_threshold;
-			entry.activeconns = atomic_read(&dest->activeconns);
-			entry.inactconns = atomic_read(&dest->inactconns);
-			entry.persistconns = atomic_read(&dest->persistconns);
-			ip_vs_copy_stats(&entry.stats, &dest->stats);
-			if (copy_to_user(&uptr->entrytable[count],
-					 &entry, sizeof(entry))) {
-				ret = -EFAULT;
-				break;
-			}
-			count++;
-		}
-		ip_vs_service_put(svc);
-	} else
-		ret = -ESRCH;
-	return ret;
-}
-
-static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
-{
-#ifdef CONFIG_IP_VS_PROTO_TCP
-	u->tcp_timeout =
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
-	u->tcp_fin_timeout =
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
-	u->udp_timeout =
-		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
-#endif
-}
-
-
-#define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
-#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
-#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
-#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
-#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
-#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
-#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
-
-static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
-	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
-	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
-	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
-	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
-	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
-	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
-	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
-};
-
-static int
-do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
-	unsigned char arg[128];
-	int ret = 0;
-
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (*len < get_arglen[GET_CMDID(cmd)]) {
-		IP_VS_ERR("get_ctl: len %u < %u\n",
-			  *len, get_arglen[GET_CMDID(cmd)]);
-		return -EINVAL;
-	}
-
-	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
-		return -EFAULT;
-
-	if (mutex_lock_interruptible(&__ip_vs_mutex))
-		return -ERESTARTSYS;
-
-	switch (cmd) {
-	case IP_VS_SO_GET_VERSION:
-	{
-		char buf[64];
-
-		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
-			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
-		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
-			ret = -EFAULT;
-			goto out;
-		}
-		*len = strlen(buf)+1;
-	}
-	break;
-
-	case IP_VS_SO_GET_INFO:
-	{
-		struct ip_vs_getinfo info;
-		info.version = IP_VS_VERSION_CODE;
-		info.size = IP_VS_CONN_TAB_SIZE;
-		info.num_services = ip_vs_num_services;
-		if (copy_to_user(user, &info, sizeof(info)) != 0)
-			ret = -EFAULT;
-	}
-	break;
-
-	case IP_VS_SO_GET_SERVICES:
-	{
-		struct ip_vs_get_services *get;
-		int size;
-
-		get = (struct ip_vs_get_services *)arg;
-		size = sizeof(*get) +
-			sizeof(struct ip_vs_service_entry) * get->num_services;
-		if (*len != size) {
-			IP_VS_ERR("length: %u != %u\n", *len, size);
-			ret = -EINVAL;
-			goto out;
-		}
-		ret = __ip_vs_get_service_entries(get, user);
-	}
-	break;
-
-	case IP_VS_SO_GET_SERVICE:
-	{
-		struct ip_vs_service_entry *entry;
-		struct ip_vs_service *svc;
-		union nf_inet_addr addr;
-
-		entry = (struct ip_vs_service_entry *)arg;
-		addr.ip = entry->addr;
-		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
-		else
-			svc = __ip_vs_service_get(AF_INET, entry->protocol,
-						  &addr, entry->port);
-		if (svc) {
-			ip_vs_copy_service(entry, svc);
-			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
-				ret = -EFAULT;
-			ip_vs_service_put(svc);
-		} else
-			ret = -ESRCH;
-	}
-	break;
-
-	case IP_VS_SO_GET_DESTS:
-	{
-		struct ip_vs_get_dests *get;
-		int size;
-
-		get = (struct ip_vs_get_dests *)arg;
-		size = sizeof(*get) +
-			sizeof(struct ip_vs_dest_entry) * get->num_dests;
-		if (*len != size) {
-			IP_VS_ERR("length: %u != %u\n", *len, size);
-			ret = -EINVAL;
-			goto out;
-		}
-		ret = __ip_vs_get_dest_entries(get, user);
-	}
-	break;
-
-	case IP_VS_SO_GET_TIMEOUT:
-	{
-		struct ip_vs_timeout_user t;
-
-		__ip_vs_get_timeouts(&t);
-		if (copy_to_user(user, &t, sizeof(t)) != 0)
-			ret = -EFAULT;
-	}
-	break;
-
-	case IP_VS_SO_GET_DAEMON:
-	{
-		struct ip_vs_daemon_user d[2];
-
-		memset(&d, 0, sizeof(d));
-		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
-			d[0].state = IP_VS_STATE_MASTER;
-			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
-			d[0].syncid = ip_vs_master_syncid;
-		}
-		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
-			d[1].state = IP_VS_STATE_BACKUP;
-			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
-			d[1].syncid = ip_vs_backup_syncid;
-		}
-		if (copy_to_user(user, &d, sizeof(d)) != 0)
-			ret = -EFAULT;
-	}
-	break;
-
-	default:
-		ret = -EINVAL;
-	}
-
-  out:
-	mutex_unlock(&__ip_vs_mutex);
-	return ret;
-}
-
-
-static struct nf_sockopt_ops ip_vs_sockopts = {
-	.pf		= PF_INET,
-	.set_optmin	= IP_VS_BASE_CTL,
-	.set_optmax	= IP_VS_SO_SET_MAX+1,
-	.set		= do_ip_vs_set_ctl,
-	.get_optmin	= IP_VS_BASE_CTL,
-	.get_optmax	= IP_VS_SO_GET_MAX+1,
-	.get		= do_ip_vs_get_ctl,
-	.owner		= THIS_MODULE,
-};
-
-/*
- * Generic Netlink interface
- */
-
-/* IPVS genetlink family */
-static struct genl_family ip_vs_genl_family = {
-	.id		= GENL_ID_GENERATE,
-	.hdrsize	= 0,
-	.name		= IPVS_GENL_NAME,
-	.version	= IPVS_GENL_VERSION,
-	.maxattr	= IPVS_CMD_MAX,
-};
-
-/* Policy used for first-level command attributes */
-static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
-	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
-	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
-	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
-	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
-	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
-	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
-};
-
-/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
-static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
-	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
-	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
-					    .len = IP_VS_IFNAME_MAXLEN },
-	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
-};
-
-/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
-static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
-	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
-	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
-	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
-					    .len = sizeof(union nf_inet_addr) },
-	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
-	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
-	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
-					    .len = IP_VS_SCHEDNAME_MAXLEN },
-	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
-					    .len = sizeof(struct ip_vs_flags) },
-	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
-	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
-	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
-};
-
-/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
-static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
-	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
-					    .len = sizeof(union nf_inet_addr) },
-	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
-	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
-	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
-};
-
-static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
-				 struct ip_vs_stats *stats)
-{
-	struct nlattr *nl_stats = nla_nest_start(skb, container_type);
-	if (!nl_stats)
-		return -EMSGSIZE;
-
-	spin_lock_bh(&stats->lock);
-
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
-	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
-	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
-	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
-
-	spin_unlock_bh(&stats->lock);
-
-	nla_nest_end(skb, nl_stats);
-
-	return 0;
-
-nla_put_failure:
-	spin_unlock_bh(&stats->lock);
-	nla_nest_cancel(skb, nl_stats);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_fill_service(struct sk_buff *skb,
-				   struct ip_vs_service *svc)
-{
-	struct nlattr *nl_service;
-	struct ip_vs_flags flags = { .flags = svc->flags,
-				     .mask = ~0 };
-
-	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
-	if (!nl_service)
-		return -EMSGSIZE;
-
-	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
-
-	if (svc->fwmark) {
-		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
-	} else {
-		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
-		NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
-		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
-	}
-
-	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
-	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
-	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
-	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
-
-	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
-		goto nla_put_failure;
-
-	nla_nest_end(skb, nl_service);
-
-	return 0;
-
-nla_put_failure:
-	nla_nest_cancel(skb, nl_service);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_service(struct sk_buff *skb,
-				   struct ip_vs_service *svc,
-				   struct netlink_callback *cb)
-{
-	void *hdr;
-
-	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-			  &ip_vs_genl_family, NLM_F_MULTI,
-			  IPVS_CMD_NEW_SERVICE);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	if (ip_vs_genl_fill_service(skb, svc) < 0)
-		goto nla_put_failure;
-
-	return genlmsg_end(skb, hdr);
-
-nla_put_failure:
-	genlmsg_cancel(skb, hdr);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_services(struct sk_buff *skb,
-				    struct netlink_callback *cb)
-{
-	int idx = 0, i;
-	int start = cb->args[0];
-	struct ip_vs_service *svc;
-
-	mutex_lock(&__ip_vs_mutex);
-	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
-			if (++idx <= start)
-				continue;
-			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
-				idx--;
-				goto nla_put_failure;
-			}
-		}
-	}
-
-	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
-			if (++idx <= start)
-				continue;
-			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
-				idx--;
-				goto nla_put_failure;
-			}
-		}
-	}
-
-nla_put_failure:
-	mutex_unlock(&__ip_vs_mutex);
-	cb->args[0] = idx;
-
-	return skb->len;
-}
-
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-				    struct nlattr *nla, int full_entry)
-{
-	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
-	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
-
-	/* Parse mandatory identifying service fields first */
-	if (nla == NULL ||
-	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
-		return -EINVAL;
-
-	nla_af		= attrs[IPVS_SVC_ATTR_AF];
-	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
-	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
-	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
-	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];
-
-	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
-		return -EINVAL;
-
-	usvc->af = nla_get_u16(nla_af);
-#ifdef CONFIG_IP_VS_IPV6
-	if (usvc->af != AF_INET && usvc->af != AF_INET6)
-#else
-	if (usvc->af != AF_INET)
-#endif
-		return -EAFNOSUPPORT;
-
-	if (nla_fwmark) {
-		usvc->protocol = IPPROTO_TCP;
-		usvc->fwmark = nla_get_u32(nla_fwmark);
-	} else {
-		usvc->protocol = nla_get_u16(nla_protocol);
-		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
-		usvc->port = nla_get_u16(nla_port);
-		usvc->fwmark = 0;
-	}
-
-	/* If a full entry was requested, check for the additional fields */
-	if (full_entry) {
-		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
-			      *nla_netmask;
-		struct ip_vs_flags flags;
-		struct ip_vs_service *svc;
-
-		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
-		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
-		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
-		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
-
-		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
-			return -EINVAL;
-
-		nla_memcpy(&flags, nla_flags, sizeof(flags));
-
-		/* prefill flags from service if it already exists */
-		if (usvc->fwmark)
-			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-		else
-			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-						  &usvc->addr, usvc->port);
-		if (svc) {
-			usvc->flags = svc->flags;
-			ip_vs_service_put(svc);
-		} else
-			usvc->flags = 0;
-
-		/* set new flags from userland */
-		usvc->flags = (usvc->flags & ~flags.mask) |
-			      (flags.flags & flags.mask);
-		usvc->sched_name = nla_data(nla_sched);
-		usvc->timeout = nla_get_u32(nla_timeout);
-		usvc->netmask = nla_get_u32(nla_netmask);
-	}
-
-	return 0;
-}
-
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
-{
-	struct ip_vs_service_user_kern usvc;
-	int ret;
-
-	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (usvc.fwmark)
-		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-	else
-		return __ip_vs_service_get(usvc.af, usvc.protocol,
-					   &usvc.addr, usvc.port);
-}
-
-static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
-{
-	struct nlattr *nl_dest;
-
-	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
-	if (!nl_dest)
-		return -EMSGSIZE;
-
-	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
-	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
-
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
-		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
-		    atomic_read(&dest->activeconns));
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
-		    atomic_read(&dest->inactconns));
-	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
-		    atomic_read(&dest->persistconns));
-
-	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
-		goto nla_put_failure;
-
-	nla_nest_end(skb, nl_dest);
-
-	return 0;
-
-nla_put_failure:
-	nla_nest_cancel(skb, nl_dest);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
-				struct netlink_callback *cb)
-{
-	void *hdr;
-
-	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-			  &ip_vs_genl_family, NLM_F_MULTI,
-			  IPVS_CMD_NEW_DEST);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	if (ip_vs_genl_fill_dest(skb, dest) < 0)
-		goto nla_put_failure;
-
-	return genlmsg_end(skb, hdr);
-
-nla_put_failure:
-	genlmsg_cancel(skb, hdr);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_dests(struct sk_buff *skb,
-				 struct netlink_callback *cb)
-{
-	int idx = 0;
-	int start = cb->args[0];
-	struct ip_vs_service *svc;
-	struct ip_vs_dest *dest;
-	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
-
-	mutex_lock(&__ip_vs_mutex);
-
-	/* Try to find the service for which to dump destinations */
-	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
-			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
-		goto out_err;
-
-	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
-	if (IS_ERR(svc) || svc == NULL)
-		goto out_err;
-
-	/* Dump the destinations */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (++idx <= start)
-			continue;
-		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
-			idx--;
-			goto nla_put_failure;
-		}
-	}
-
-nla_put_failure:
-	cb->args[0] = idx;
-	ip_vs_service_put(svc);
-
-out_err:
-	mutex_unlock(&__ip_vs_mutex);
-
-	return skb->len;
-}
-
-static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
-				 struct nlattr *nla, int full_entry)
-{
-	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
-	struct nlattr *nla_addr, *nla_port;
-
-	/* Parse mandatory identifying destination fields first */
-	if (nla == NULL ||
-	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
-		return -EINVAL;
-
-	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
-	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
-
-	if (!(nla_addr && nla_port))
-		return -EINVAL;
-
-	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
-	udest->port = nla_get_u16(nla_port);
-
-	/* If a full entry was requested, check for the additional fields */
-	if (full_entry) {
-		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
-			      *nla_l_thresh;
-
-		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
-		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
-		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
-		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
-
-		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
-			return -EINVAL;
-
-		udest->conn_flags = nla_get_u32(nla_fwd)
-				    & IP_VS_CONN_F_FWD_MASK;
-		udest->weight = nla_get_u32(nla_weight);
-		udest->u_threshold = nla_get_u32(nla_u_thresh);
-		udest->l_threshold = nla_get_u32(nla_l_thresh);
-	}
-
-	return 0;
-}
-
-static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
-				  const char *mcast_ifn, __be32 syncid)
-{
-	struct nlattr *nl_daemon;
-
-	nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
-	if (!nl_daemon)
-		return -EMSGSIZE;
-
-	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
-	NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
-	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
-
-	nla_nest_end(skb, nl_daemon);
-
-	return 0;
-
-nla_put_failure:
-	nla_nest_cancel(skb, nl_daemon);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
-				  const char *mcast_ifn, __be32 syncid,
-				  struct netlink_callback *cb)
-{
-	void *hdr;
-	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-			  &ip_vs_genl_family, NLM_F_MULTI,
-			  IPVS_CMD_NEW_DAEMON);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
-		goto nla_put_failure;
-
-	return genlmsg_end(skb, hdr);
-
-nla_put_failure:
-	genlmsg_cancel(skb, hdr);
-	return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
-				   struct netlink_callback *cb)
-{
-	mutex_lock(&__ip_vs_mutex);
-	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
-		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
-					   ip_vs_master_mcast_ifn,
-					   ip_vs_master_syncid, cb) < 0)
-			goto nla_put_failure;
-
-		cb->args[0] = 1;
-	}
-
-	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
-		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
-					   ip_vs_backup_mcast_ifn,
-					   ip_vs_backup_syncid, cb) < 0)
-			goto nla_put_failure;
-
-		cb->args[1] = 1;
-	}
-
-nla_put_failure:
-	mutex_unlock(&__ip_vs_mutex);
-
-	return skb->len;
-}
-
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
-{
-	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
-	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
-	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
-		return -EINVAL;
-
-	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
-				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
-				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
-}
-
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
-{
-	if (!attrs[IPVS_DAEMON_ATTR_STATE])
-		return -EINVAL;
-
-	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
-}
-
-static int ip_vs_genl_set_config(struct nlattr **attrs)
-{
-	struct ip_vs_timeout_user t;
-
-	__ip_vs_get_timeouts(&t);
-
-	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
-		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
-
-	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
-		t.tcp_fin_timeout =
-			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
-
-	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
-		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
-
-	return ip_vs_set_timeout(&t);
-}
-
-static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
-{
-	struct ip_vs_service *svc = NULL;
-	struct ip_vs_service_user_kern usvc;
-	struct ip_vs_dest_user_kern udest;
-	int ret = 0, cmd;
-	int need_full_svc = 0, need_full_dest = 0;
-
-	cmd = info->genlhdr->cmd;
-
-	mutex_lock(&__ip_vs_mutex);
-
-	if (cmd == IPVS_CMD_FLUSH) {
-		ret = ip_vs_flush();
-		goto out;
-	} else if (cmd == IPVS_CMD_SET_CONFIG) {
-		ret = ip_vs_genl_set_config(info->attrs);
-		goto out;
-	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
-		   cmd == IPVS_CMD_DEL_DAEMON) {
-
-		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
-
-		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
-		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
-				     info->attrs[IPVS_CMD_ATTR_DAEMON],
-				     ip_vs_daemon_policy)) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (cmd == IPVS_CMD_NEW_DAEMON)
-			ret = ip_vs_genl_new_daemon(daemon_attrs);
-		else
-			ret = ip_vs_genl_del_daemon(daemon_attrs);
-		goto out;
-	} else if (cmd == IPVS_CMD_ZERO &&
-		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
-		ret = ip_vs_zero_all();
-		goto out;
-	}
-
-	/* All following commands require a service argument, so check if we
-	 * received a valid one. We need a full service specification when
-	 * adding / editing a service. Only identifying members otherwise. */
-	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
-		need_full_svc = 1;
-
-	ret = ip_vs_genl_parse_service(&usvc,
-				       info->attrs[IPVS_CMD_ATTR_SERVICE],
-				       need_full_svc);
-	if (ret)
-		goto out;
-
-	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
-	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
-	/* Unless we're adding a new service, the service must already exist */
-	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
-		ret = -ESRCH;
-		goto out;
-	}
-
-	/* Destination commands require a valid destination argument. For
-	 * adding / editing a destination, we need a full destination
-	 * specification. */
-	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
-	    cmd == IPVS_CMD_DEL_DEST) {
-		if (cmd != IPVS_CMD_DEL_DEST)
-			need_full_dest = 1;
-
-		ret = ip_vs_genl_parse_dest(&udest,
-					    info->attrs[IPVS_CMD_ATTR_DEST],
-					    need_full_dest);
-		if (ret)
-			goto out;
-	}
-
-	switch (cmd) {
-	case IPVS_CMD_NEW_SERVICE:
-		if (svc == NULL)
-			ret = ip_vs_add_service(&usvc, &svc);
-		else
-			ret = -EEXIST;
-		break;
-	case IPVS_CMD_SET_SERVICE:
-		ret = ip_vs_edit_service(svc, &usvc);
-		break;
-	case IPVS_CMD_DEL_SERVICE:
-		ret = ip_vs_del_service(svc);
-		break;
-	case IPVS_CMD_NEW_DEST:
-		ret = ip_vs_add_dest(svc, &udest);
-		break;
-	case IPVS_CMD_SET_DEST:
-		ret = ip_vs_edit_dest(svc, &udest);
-		break;
-	case IPVS_CMD_DEL_DEST:
-		ret = ip_vs_del_dest(svc, &udest);
-		break;
-	case IPVS_CMD_ZERO:
-		ret = ip_vs_zero_service(svc);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-out:
-	if (svc)
-		ip_vs_service_put(svc);
-	mutex_unlock(&__ip_vs_mutex);
-
-	return ret;
-}
-
-static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
-{
-	struct sk_buff *msg;
-	void *reply;
-	int ret, cmd, reply_cmd;
-
-	cmd = info->genlhdr->cmd;
-
-	if (cmd == IPVS_CMD_GET_SERVICE)
-		reply_cmd = IPVS_CMD_NEW_SERVICE;
-	else if (cmd == IPVS_CMD_GET_INFO)
-		reply_cmd = IPVS_CMD_SET_INFO;
-	else if (cmd == IPVS_CMD_GET_CONFIG)
-		reply_cmd = IPVS_CMD_SET_CONFIG;
-	else {
-		IP_VS_ERR("unknown Generic Netlink command\n");
-		return -EINVAL;
-	}
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
-
-	mutex_lock(&__ip_vs_mutex);
-
-	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
-	if (reply == NULL)
-		goto nla_put_failure;
-
-	switch (cmd) {
-	case IPVS_CMD_GET_SERVICE:
-	{
-		struct ip_vs_service *svc;
-
-		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
-		if (IS_ERR(svc)) {
-			ret = PTR_ERR(svc);
-			goto out_err;
-		} else if (svc) {
-			ret = ip_vs_genl_fill_service(msg, svc);
-			ip_vs_service_put(svc);
-			if (ret)
-				goto nla_put_failure;
-		} else {
-			ret = -ESRCH;
-			goto out_err;
-		}
-
-		break;
-	}
-
-	case IPVS_CMD_GET_CONFIG:
-	{
-		struct ip_vs_timeout_user t;
-
-		__ip_vs_get_timeouts(&t);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
-		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
-			    t.tcp_fin_timeout);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
-		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
-#endif
-
-		break;
-	}
-
-	case IPVS_CMD_GET_INFO:
-		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
-		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
-			    IP_VS_CONN_TAB_SIZE);
-		break;
-	}
-
-	genlmsg_end(msg, reply);
-	ret = genlmsg_unicast(msg, info->snd_pid);
-	goto out;
-
-nla_put_failure:
-	IP_VS_ERR("not enough space in Netlink message\n");
-	ret = -EMSGSIZE;
-
-out_err:
-	nlmsg_free(msg);
-out:
-	mutex_unlock(&__ip_vs_mutex);
-
-	return ret;
-}
-
-
-static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
-	{
-		.cmd	= IPVS_CMD_NEW_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_SET_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_DEL_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_GET_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
-		.doit	= ip_vs_genl_get_cmd,
-		.dumpit	= ip_vs_genl_dump_services,
-		.policy	= ip_vs_cmd_policy,
-	},
-	{
-		.cmd	= IPVS_CMD_NEW_DEST,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_SET_DEST,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_DEL_DEST,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_GET_DEST,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.dumpit	= ip_vs_genl_dump_dests,
-	},
-	{
-		.cmd	= IPVS_CMD_NEW_DAEMON,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_DEL_DAEMON,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_GET_DAEMON,
-		.flags	= GENL_ADMIN_PERM,
-		.dumpit	= ip_vs_genl_dump_daemons,
-	},
-	{
-		.cmd	= IPVS_CMD_SET_CONFIG,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_GET_CONFIG,
-		.flags	= GENL_ADMIN_PERM,
-		.doit	= ip_vs_genl_get_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_GET_INFO,
-		.flags	= GENL_ADMIN_PERM,
-		.doit	= ip_vs_genl_get_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_ZERO,
-		.flags	= GENL_ADMIN_PERM,
-		.policy	= ip_vs_cmd_policy,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-	{
-		.cmd	= IPVS_CMD_FLUSH,
-		.flags	= GENL_ADMIN_PERM,
-		.doit	= ip_vs_genl_set_cmd,
-	},
-};
-
-static int __init ip_vs_genl_register(void)
-{
-	int ret, i;
-
-	ret = genl_register_family(&ip_vs_genl_family);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
-		ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
-		if (ret)
-			goto err_out;
-	}
-	return 0;
-
-err_out:
-	genl_unregister_family(&ip_vs_genl_family);
-	return ret;
-}
-
-static void ip_vs_genl_unregister(void)
-{
-	genl_unregister_family(&ip_vs_genl_family);
-}
-
-/* End of Generic Netlink interface definitions */
-
-
-int __init ip_vs_control_init(void)
-{
-	int ret;
-	int idx;
-
-	EnterFunction(2);
-
-	ret = nf_register_sockopt(&ip_vs_sockopts);
-	if (ret) {
-		IP_VS_ERR("cannot register sockopt.\n");
-		return ret;
-	}
-
-	ret = ip_vs_genl_register();
-	if (ret) {
-		IP_VS_ERR("cannot register Generic Netlink interface.\n");
-		nf_unregister_sockopt(&ip_vs_sockopts);
-		return ret;
-	}
-
-	proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
-	proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
-
-	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
-
-	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-	}
-	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
-	}
-
-	ip_vs_new_estimator(&ip_vs_stats);
-
-	/* Hook the defense timer */
-	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
-	LeaveFunction(2);
-	return 0;
-}
-
-
-void ip_vs_control_cleanup(void)
-{
-	EnterFunction(2);
-	ip_vs_trash_cleanup();
-	cancel_rearming_delayed_work(&defense_work);
-	cancel_work_sync(&defense_work.work);
-	ip_vs_kill_estimator(&ip_vs_stats);
-	unregister_sysctl_table(sysctl_header);
-	proc_net_remove(&init_net, "ip_vs_stats");
-	proc_net_remove(&init_net, "ip_vs");
-	ip_vs_genl_unregister();
-	nf_unregister_sockopt(&ip_vs_sockopts);
-	LeaveFunction(2);
-}
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
deleted file mode 100644
index 2eb2860dabb..00000000000
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * ip_vs_est.c: simple rate estimator for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/sysctl.h>
-#include <linux/list.h>
-
-#include <net/ip_vs.h>
-
-/*
-  This code is to estimate rate in a shorter interval (such as 8
-  seconds) for virtual services and real servers. For measure rate in a
-  long interval, it is easy to implement a user level daemon which
-  periodically reads those statistical counters and measure rate.
-
-  Currently, the measurement is activated by slow timer handler. Hope
-  this measurement will not introduce too much load.
-
-  We measure rate during the last 8 seconds every 2 seconds:
-
-    avgrate = avgrate*(1-W) + rate*W
-
-    where W = 2^(-2)
-
-  NOTES.
-
-  * The stored value for average bps is scaled by 2^5, so that maximal
-    rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
-
-  * A lot code is taken from net/sched/estimator.c
- */
-
-
-static void estimation_timer(unsigned long arg);
-
-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
-static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
-
-static void estimation_timer(unsigned long arg)
-{
-	struct ip_vs_estimator *e;
-	struct ip_vs_stats *s;
-	u32 n_conns;
-	u32 n_inpkts, n_outpkts;
-	u64 n_inbytes, n_outbytes;
-	u32 rate;
-
-	spin_lock(&est_lock);
-	list_for_each_entry(e, &est_list, list) {
-		s = container_of(e, struct ip_vs_stats, est);
-
-		spin_lock(&s->lock);
-		n_conns = s->ustats.conns;
-		n_inpkts = s->ustats.inpkts;
-		n_outpkts = s->ustats.outpkts;
-		n_inbytes = s->ustats.inbytes;
-		n_outbytes = s->ustats.outbytes;
-
-		/* scaled by 2^10, but divided 2 seconds */
-		rate = (n_conns - e->last_conns)<<9;
-		e->last_conns = n_conns;
-		e->cps += ((long)rate - (long)e->cps)>>2;
-		s->ustats.cps = (e->cps+0x1FF)>>10;
-
-		rate = (n_inpkts - e->last_inpkts)<<9;
-		e->last_inpkts = n_inpkts;
-		e->inpps += ((long)rate - (long)e->inpps)>>2;
-		s->ustats.inpps = (e->inpps+0x1FF)>>10;
-
-		rate = (n_outpkts - e->last_outpkts)<<9;
-		e->last_outpkts = n_outpkts;
-		e->outpps += ((long)rate - (long)e->outpps)>>2;
-		s->ustats.outpps = (e->outpps+0x1FF)>>10;
-
-		rate = (n_inbytes - e->last_inbytes)<<4;
-		e->last_inbytes = n_inbytes;
-		e->inbps += ((long)rate - (long)e->inbps)>>2;
-		s->ustats.inbps = (e->inbps+0xF)>>5;
-
-		rate = (n_outbytes - e->last_outbytes)<<4;
-		e->last_outbytes = n_outbytes;
-		e->outbps += ((long)rate - (long)e->outbps)>>2;
-		s->ustats.outbps = (e->outbps+0xF)>>5;
-		spin_unlock(&s->lock);
-	}
-	spin_unlock(&est_lock);
-	mod_timer(&est_timer, jiffies + 2*HZ);
-}
-
-void ip_vs_new_estimator(struct ip_vs_stats *stats)
-{
-	struct ip_vs_estimator *est = &stats->est;
-
-	INIT_LIST_HEAD(&est->list);
-
-	est->last_conns = stats->ustats.conns;
-	est->cps = stats->ustats.cps<<10;
-
-	est->last_inpkts = stats->ustats.inpkts;
-	est->inpps = stats->ustats.inpps<<10;
-
-	est->last_outpkts = stats->ustats.outpkts;
-	est->outpps = stats->ustats.outpps<<10;
-
-	est->last_inbytes = stats->ustats.inbytes;
-	est->inbps = stats->ustats.inbps<<5;
-
-	est->last_outbytes = stats->ustats.outbytes;
-	est->outbps = stats->ustats.outbps<<5;
-
-	spin_lock_bh(&est_lock);
-	list_add(&est->list, &est_list);
-	spin_unlock_bh(&est_lock);
-}
-
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
-{
-	struct ip_vs_estimator *est = &stats->est;
-
-	spin_lock_bh(&est_lock);
-	list_del(&est->list);
-	spin_unlock_bh(&est_lock);
-}
-
-void ip_vs_zero_estimator(struct ip_vs_stats *stats)
-{
-	struct ip_vs_estimator *est = &stats->est;
-
-	/* set counters zero, caller must hold the stats->lock lock */
-	est->last_inbytes = 0;
-	est->last_outbytes = 0;
-	est->last_conns = 0;
-	est->last_inpkts = 0;
-	est->last_outpkts = 0;
-	est->cps = 0;
-	est->inpps = 0;
-	est->outpps = 0;
-	est->inbps = 0;
-	est->outbps = 0;
-}
-
-int __init ip_vs_estimator_init(void)
-{
-	mod_timer(&est_timer, jiffies + 2 * HZ);
-	return 0;
-}
-
-void ip_vs_estimator_cleanup(void)
-{
-	del_timer_sync(&est_timer);
-}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
deleted file mode 100644
index 2e7dbd8b73a..00000000000
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * ip_vs_ftp.c: IPVS ftp application module
- *
- * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * Changes:
- *
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
- * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
- *
- *		IP_MASQ_FTP ftp masquerading module
- *
- * Version:	@(#)ip_masq_ftp.c 0.04   02/05/96
- *
- * Author:	Wouter Gadeyne
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <asm/unaligned.h>
-
-#include <net/ip_vs.h>
-
-
-#define SERVER_STRING "227 Entering Passive Mode ("
-#define CLIENT_STRING "PORT "
-
-
-/*
- * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
-module_param_array(ports, ushort, NULL, 0);
-MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
-
-
-/*	Dummy variable */
-static int ip_vs_ftp_pasv;
-
-
-static int
-ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
-{
-	return 0;
-}
-
-
-static int
-ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
-{
-	return 0;
-}
-
-
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern" and terminated with the "term" character.
- * <addr,port> is in network order.
- */
-static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
-				  const char *pattern, size_t plen, char term,
-				  __be32 *addr, __be16 *port,
-				  char **start, char **end)
-{
-	unsigned char p[6];
-	int i = 0;
-
-	if (data_limit - data < plen) {
-		/* check if there is partial match */
-		if (strnicmp(data, pattern, data_limit - data) == 0)
-			return -1;
-		else
-			return 0;
-	}
-
-	if (strnicmp(data, pattern, plen) != 0) {
-		return 0;
-	}
-	*start = data + plen;
-
-	for (data = *start; *data != term; data++) {
-		if (data == data_limit)
-			return -1;
-	}
-	*end = data;
-
-	memset(p, 0, sizeof(p));
-	for (data = *start; data != *end; data++) {
-		if (*data >= '0' && *data <= '9') {
-			p[i] = p[i]*10 + *data - '0';
-		} else if (*data == ',' && i < 5) {
-			i++;
-		} else {
-			/* unexpected character */
-			return -1;
-		}
-	}
-
-	if (i != 5)
-		return -1;
-
-	*addr = get_unaligned((__be32 *)p);
-	*port = get_unaligned((__be16 *)(p + 4));
-	return 1;
-}
-
-
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
- * from the server (inside-to-outside).
- * When we see one, we build a connection entry with the client address,
- * client port 0 (unknown at the moment), the server address and the
- * server port.  Mark the current connection entry as a control channel
- * of the new entry. All this work is just to make the data connection
- * can be scheduled to the right server later.
- *
- * The outgoing packet should be something like
- *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
- * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
- */
-static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
-			 struct sk_buff *skb, int *diff)
-{
-	struct iphdr *iph;
-	struct tcphdr *th;
-	char *data, *data_limit;
-	char *start, *end;
-	union nf_inet_addr from;
-	__be16 port;
-	struct ip_vs_conn *n_cp;
-	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
-	unsigned buf_len;
-	int ret;
-
-#ifdef CONFIG_IP_VS_IPV6
-	/* This application helper doesn't work with IPv6 yet,
-	 * so turn this into a no-op for IPv6 packets
-	 */
-	if (cp->af == AF_INET6)
-		return 1;
-#endif
-
-	*diff = 0;
-
-	/* Only useful for established sessions */
-	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
-		return 1;
-
-	/* Linear packets are much easier to deal with. */
-	if (!skb_make_writable(skb, skb->len))
-		return 0;
-
-	if (cp->app_data == &ip_vs_ftp_pasv) {
-		iph = ip_hdr(skb);
-		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-		data = (char *)th + (th->doff << 2);
-		data_limit = skb_tail_pointer(skb);
-
-		if (ip_vs_ftp_get_addrport(data, data_limit,
-					   SERVER_STRING,
-					   sizeof(SERVER_STRING)-1, ')',
-					   &from.ip, &port,
-					   &start, &end) != 1)
-			return 1;
-
-		IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
-			  "%u.%u.%u.%u:%d detected\n",
-			  NIPQUAD(from.ip), ntohs(port),
-			  NIPQUAD(cp->caddr.ip), 0);
-
-		/*
-		 * Now update or create an connection entry for it
-		 */
-		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-					  &cp->caddr, 0);
-		if (!n_cp) {
-			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-					      &cp->caddr, 0,
-					      &cp->vaddr, port,
-					      &from, port,
-					      IP_VS_CONN_F_NO_CPORT,
-					      cp->dest);
-			if (!n_cp)
-				return 0;
-
-			/* add its controller */
-			ip_vs_control_add(n_cp, cp);
-		}
-
-		/*
-		 * Replace the old passive address with the new one
-		 */
-		from.ip = n_cp->vaddr.ip;
-		port = n_cp->vport;
-		sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
-			(ntohs(port)>>8)&255, ntohs(port)&255);
-		buf_len = strlen(buf);
-
-		/*
-		 * Calculate required delta-offset to keep TCP happy
-		 */
-		*diff = buf_len - (end-start);
-
-		if (*diff == 0) {
-			/* simply replace it with new passive address */
-			memcpy(start, buf, buf_len);
-			ret = 1;
-		} else {
-			ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
-					  end-start, buf, buf_len);
-		}
-
-		cp->app_data = NULL;
-		ip_vs_tcp_conn_listen(n_cp);
-		ip_vs_conn_put(n_cp);
-		return ret;
-	}
-	return 1;
-}
-
-
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
- * (outside-to-inside).
- *
- * The incoming packet having the PORT command should be something like
- *      "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
- * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
- * In this case, we create a connection entry using the client address and
- * port, so that the active ftp data connection from the server can reach
- * the client.
- */
-static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
-			struct sk_buff *skb, int *diff)
-{
-	struct iphdr *iph;
-	struct tcphdr *th;
-	char *data, *data_start, *data_limit;
-	char *start, *end;
-	union nf_inet_addr to;
-	__be16 port;
-	struct ip_vs_conn *n_cp;
-
-#ifdef CONFIG_IP_VS_IPV6
-	/* This application helper doesn't work with IPv6 yet,
-	 * so turn this into a no-op for IPv6 packets
-	 */
-	if (cp->af == AF_INET6)
-		return 1;
-#endif
-
-	/* no diff required for incoming packets */
-	*diff = 0;
-
-	/* Only useful for established sessions */
-	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
-		return 1;
-
-	/* Linear packets are much easier to deal with. */
-	if (!skb_make_writable(skb, skb->len))
-		return 0;
-
-	/*
-	 * Detecting whether it is passive
-	 */
-	iph = ip_hdr(skb);
-	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
-	/* Since there may be OPTIONS in the TCP packet and the HLEN is
-	   the length of the header in 32-bit multiples, it is accurate
-	   to calculate data address by th+HLEN*4 */
-	data = data_start = (char *)th + (th->doff << 2);
-	data_limit = skb_tail_pointer(skb);
-
-	while (data <= data_limit - 6) {
-		if (strnicmp(data, "PASV\r\n", 6) == 0) {
-			/* Passive mode on */
-			IP_VS_DBG(7, "got PASV at %td of %td\n",
-				  data - data_start,
-				  data_limit - data_start);
-			cp->app_data = &ip_vs_ftp_pasv;
-			return 1;
-		}
-		data++;
-	}
-
-	/*
-	 * To support virtual FTP server, the scenerio is as follows:
-	 *       FTP client ----> Load Balancer ----> FTP server
-	 * First detect the port number in the application data,
-	 * then create a new connection entry for the coming data
-	 * connection.
-	 */
-	if (ip_vs_ftp_get_addrport(data_start, data_limit,
-				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
-				   '\r', &to.ip, &port,
-				   &start, &end) != 1)
-		return 1;
-
-	IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
-		  NIPQUAD(to.ip), ntohs(port));
-
-	/* Passive mode off */
-	cp->app_data = NULL;
-
-	/*
-	 * Now update or create a connection entry for it
-	 */
-	IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
-		  ip_vs_proto_name(iph->protocol),
-		  NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
-
-	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-				 &to, port,
-				 &cp->vaddr, htons(ntohs(cp->vport)-1));
-	if (!n_cp) {
-		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-				      &to, port,
-				      &cp->vaddr, htons(ntohs(cp->vport)-1),
-				      &cp->daddr, htons(ntohs(cp->dport)-1),
-				      0,
-				      cp->dest);
-		if (!n_cp)
-			return 0;
-
-		/* add its controller */
-		ip_vs_control_add(n_cp, cp);
-	}
-
-	/*
-	 *	Move tunnel to listen state
-	 */
-	ip_vs_tcp_conn_listen(n_cp);
-	ip_vs_conn_put(n_cp);
-
-	return 1;
-}
-
-
-static struct ip_vs_app ip_vs_ftp = {
-	.name =		"ftp",
-	.type =		IP_VS_APP_TYPE_FTP,
-	.protocol =	IPPROTO_TCP,
-	.module =	THIS_MODULE,
-	.incs_list =	LIST_HEAD_INIT(ip_vs_ftp.incs_list),
-	.init_conn =	ip_vs_ftp_init_conn,
-	.done_conn =	ip_vs_ftp_done_conn,
-	.bind_conn =	NULL,
-	.unbind_conn =	NULL,
-	.pkt_out =	ip_vs_ftp_out,
-	.pkt_in =	ip_vs_ftp_in,
-};
-
-
-/*
- *	ip_vs_ftp initialization
- */
-static int __init ip_vs_ftp_init(void)
-{
-	int i, ret;
-	struct ip_vs_app *app = &ip_vs_ftp;
-
-	ret = register_ip_vs_app(app);
-	if (ret)
-		return ret;
-
-	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
-		if (!ports[i])
-			continue;
-		ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
-		if (ret)
-			break;
-		IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
-			   app->name, i, ports[i]);
-	}
-
-	if (ret)
-		unregister_ip_vs_app(app);
-
-	return ret;
-}
-
-
-/*
- *	ip_vs_ftp finish.
- */
-static void __exit ip_vs_ftp_exit(void)
-{
-	unregister_ip_vs_app(&ip_vs_ftp);
-}
-
-
-module_init(ip_vs_ftp_init);
-module_exit(ip_vs_ftp_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
deleted file mode 100644
index 6ecef3518ca..00000000000
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
- * IPVS:        Locality-Based Least-Connection scheduling module
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Martin Hamilton         :    fixed the terrible locking bugs
- *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
- *     Wensong Zhang           :    fixed the uninitilized tbl->lock bug
- *     Wensong Zhang           :    added doing full expiration check to
- *                                   collect stale entries of 24+ hours when
- *                                   no partial expire check in a half hour
- *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
- *                                   to avoid the possible race between timer
- *                                   handler and del_timer thread in SMP
- *
- */
-
-/*
- * The lblc algorithm is as follows (pseudo code):
- *
- *       if cachenode[dest_ip] is null then
- *               n, cachenode[dest_ip] <- {weighted least-conn node};
- *       else
- *               n <- cachenode[dest_ip];
- *               if (n is dead) OR
- *                  (n.conns>n.weight AND
- *                   there is a node m with m.conns<m.weight/2) then
- *                 n, cachenode[dest_ip] <- {weighted least-conn node};
- *
- *       return n;
- *
- * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
- * me to write this module.
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/jiffies.h>
-
-/* for sysctl */
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *    It is for garbage collection of stale IPVS lblc entries,
- *    when the table is full.
- */
-#define CHECK_EXPIRE_INTERVAL   (60*HZ)
-#define ENTRY_TIMEOUT           (6*60*HZ)
-
-/*
- *    It is for full expiration check.
- *    When there is no partial expiration check (garbage collection)
- *    in a half hour, do a full expiration check to collect stale
- *    entries that haven't been touched for a day.
- */
-#define COUNT_FOR_FULL_EXPIRATION   30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
-
-
-/*
- *     for IPVS lblc entry hash table
- */
-#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
-#define CONFIG_IP_VS_LBLC_TAB_BITS      10
-#endif
-#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
-#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
-#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
-
-
-/*
- *      IPVS lblc entry represents an association between destination
- *      IP address and its destination server
- */
-struct ip_vs_lblc_entry {
-	struct list_head        list;
-	__be32                  addr;           /* destination IP address */
-	struct ip_vs_dest       *dest;          /* real server (cache) */
-	unsigned long           lastuse;        /* last used time */
-};
-
-
-/*
- *      IPVS lblc hash table
- */
-struct ip_vs_lblc_table {
-	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
-	atomic_t                entries;        /* number of entries */
-	int                     max_size;       /* maximum size of entries */
-	struct timer_list       periodic_timer; /* collect stale entries */
-	int                     rover;          /* rover for expire check */
-	int                     counter;        /* counter for no expire */
-};
-
-
-/*
- *      IPVS LBLC sysctl table
- */
-
-static ctl_table vs_vars_table[] = {
-	{
-		.procname	= "lblc_expiration",
-		.data		= &sysctl_ip_vs_lblc_expiration,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{ .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
-{
-	list_del(&en->list);
-	/*
-	 * We don't kfree dest because it is refered either by its service
-	 * or the trash dest list.
-	 */
-	atomic_dec(&en->dest->refcnt);
-	kfree(en);
-}
-
-
-/*
- *	Returns hash value for IPVS LBLC entry
- */
-static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
-{
-	return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
-}
-
-
-/*
- *	Hash an entry in the ip_vs_lblc_table.
- *	returns bool success.
- */
-static void
-ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
-{
-	unsigned hash = ip_vs_lblc_hashkey(en->addr);
-
-	list_add(&en->list, &tbl->bucket[hash]);
-	atomic_inc(&tbl->entries);
-}
-
-
-/*
- *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- *  lock
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
-{
-	unsigned hash = ip_vs_lblc_hashkey(addr);
-	struct ip_vs_lblc_entry *en;
-
-	list_for_each_entry(en, &tbl->bucket[hash], list)
-		if (en->addr == addr)
-			return en;
-
-	return NULL;
-}
-
-
-/*
- * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
-	       struct ip_vs_dest *dest)
-{
-	struct ip_vs_lblc_entry *en;
-
-	en = ip_vs_lblc_get(tbl, daddr);
-	if (!en) {
-		en = kmalloc(sizeof(*en), GFP_ATOMIC);
-		if (!en) {
-			IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
-			return NULL;
-		}
-
-		en->addr = daddr;
-		en->lastuse = jiffies;
-
-		atomic_inc(&dest->refcnt);
-		en->dest = dest;
-
-		ip_vs_lblc_hash(tbl, en);
-	} else if (en->dest != dest) {
-		atomic_dec(&en->dest->refcnt);
-		atomic_inc(&dest->refcnt);
-		en->dest = dest;
-	}
-
-	return en;
-}
-
-
-/*
- *      Flush all the entries of the specified table.
- */
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
-{
-	struct ip_vs_lblc_entry *en, *nxt;
-	int i;
-
-	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
-			ip_vs_lblc_free(en);
-			atomic_dec(&tbl->entries);
-		}
-	}
-}
-
-
-static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
-{
-	struct ip_vs_lblc_table *tbl = svc->sched_data;
-	struct ip_vs_lblc_entry *en, *nxt;
-	unsigned long now = jiffies;
-	int i, j;
-
-	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
-
-		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-			if (time_before(now,
-					en->lastuse + sysctl_ip_vs_lblc_expiration))
-				continue;
-
-			ip_vs_lblc_free(en);
-			atomic_dec(&tbl->entries);
-		}
-		write_unlock(&svc->sched_lock);
-	}
-	tbl->rover = j;
-}
-
-
-/*
- *      Periodical timer handler for IPVS lblc table
- *      It is used to collect stale entries when the number of entries
- *      exceeds the maximum size of the table.
- *
- *      Fixme: we probably need more complicated algorithm to collect
- *             entries that have not been used for a long time even
- *             if the number of entries doesn't exceed the maximum size
- *             of the table.
- *      The full expiration check is for this purpose now.
- */
-static void ip_vs_lblc_check_expire(unsigned long data)
-{
-	struct ip_vs_service *svc = (struct ip_vs_service *) data;
-	struct ip_vs_lblc_table *tbl = svc->sched_data;
-	unsigned long now = jiffies;
-	int goal;
-	int i, j;
-	struct ip_vs_lblc_entry *en, *nxt;
-
-	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
-		/* do full expiration check */
-		ip_vs_lblc_full_check(svc);
-		tbl->counter = 1;
-		goto out;
-	}
-
-	if (atomic_read(&tbl->entries) <= tbl->max_size) {
-		tbl->counter++;
-		goto out;
-	}
-
-	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
-	if (goal > tbl->max_size/2)
-		goal = tbl->max_size/2;
-
-	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
-
-		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
-				continue;
-
-			ip_vs_lblc_free(en);
-			atomic_dec(&tbl->entries);
-			goal--;
-		}
-		write_unlock(&svc->sched_lock);
-		if (goal <= 0)
-			break;
-	}
-	tbl->rover = j;
-
-  out:
-	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
-}
-
-
-static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
-{
-	int i;
-	struct ip_vs_lblc_table *tbl;
-
-	/*
-	 *    Allocate the ip_vs_lblc_table for this service
-	 */
-	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
-	if (tbl == NULL) {
-		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
-		return -ENOMEM;
-	}
-	svc->sched_data = tbl;
-	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
-		  "current service\n", sizeof(*tbl));
-
-	/*
-	 *    Initialize the hash buckets
-	 */
-	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		INIT_LIST_HEAD(&tbl->bucket[i]);
-	}
-	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
-	tbl->rover = 0;
-	tbl->counter = 1;
-
-	/*
-	 *    Hook periodic timer for garbage collection
-	 */
-	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
-			(unsigned long)svc);
-	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
-
-	return 0;
-}
-
-
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_lblc_table *tbl = svc->sched_data;
-
-	/* remove periodic timer */
-	del_timer_sync(&tbl->periodic_timer);
-
-	/* got to clean up table entries here */
-	ip_vs_lblc_flush(tbl);
-
-	/* release the table itself */
-	kfree(tbl);
-	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
-		  sizeof(*tbl));
-
-	return 0;
-}
-
-
-static inline struct ip_vs_dest *
-__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
-{
-	struct ip_vs_dest *dest, *least;
-	int loh, doh;
-
-	/*
-	 * We think the overhead of processing active connections is fifty
-	 * times higher than that of inactive connections in average. (This
-	 * fifty times might not be accurate, we will change it later.) We
-	 * use the following formula to estimate the overhead:
-	 *                dest->activeconns*50 + dest->inactconns
-	 * and the load:
-	 *                (dest overhead) / dest->weight
-	 *
-	 * Remember -- no floats in kernel mode!!!
-	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
-	 *                h1/w1 > h2/w2
-	 * if every weight is larger than zero.
-	 *
-	 * The server with weight=0 is quiesced and will not receive any
-	 * new connection.
-	 */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-		if (atomic_read(&dest->weight) > 0) {
-			least = dest;
-			loh = atomic_read(&least->activeconns) * 50
-				+ atomic_read(&least->inactconns);
-			goto nextstage;
-		}
-	}
-	return NULL;
-
-	/*
-	 *    Find the destination with the least load.
-	 */
-  nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-
-		doh = atomic_read(&dest->activeconns) * 50
-			+ atomic_read(&dest->inactconns);
-		if (loh * atomic_read(&dest->weight) >
-		    doh * atomic_read(&least->weight)) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->refcnt),
-		  atomic_read(&least->weight), loh);
-
-	return least;
-}
-
-
-/*
- *   If this destination server is overloaded and there is a less loaded
- *   server, then return true.
- */
-static inline int
-is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
-	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
-		struct ip_vs_dest *d;
-
-		list_for_each_entry(d, &svc->destinations, n_list) {
-			if (atomic_read(&d->activeconns)*2
-			    < atomic_read(&d->weight)) {
-				return 1;
-			}
-		}
-	}
-	return 0;
-}
-
-
-/*
- *    Locality-Based (weighted) Least-Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_lblc_table *tbl = svc->sched_data;
-	struct iphdr *iph = ip_hdr(skb);
-	struct ip_vs_dest *dest = NULL;
-	struct ip_vs_lblc_entry *en;
-
-	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
-
-	/* First look in our cache */
-	read_lock(&svc->sched_lock);
-	en = ip_vs_lblc_get(tbl, iph->daddr);
-	if (en) {
-		/* We only hold a read lock, but this is atomic */
-		en->lastuse = jiffies;
-
-		/*
-		 * If the destination is not available, i.e. it's in the trash,
-		 * we must ignore it, as it may be removed from under our feet,
-		 * if someone drops our reference count. Our caller only makes
-		 * sure that destinations, that are not in the trash, are not
-		 * moved to the trash, while we are scheduling. But anyone can
-		 * free up entries from the trash at any time.
-		 */
-
-		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
-			dest = en->dest;
-	}
-	read_unlock(&svc->sched_lock);
-
-	/* If the destination has a weight and is not overloaded, use it */
-	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
-		goto out;
-
-	/* No cache entry or it is invalid, time to schedule */
-	dest = __ip_vs_lblc_schedule(svc, iph);
-	if (!dest) {
-		IP_VS_DBG(1, "no destination available\n");
-		return NULL;
-	}
-
-	/* If we fail to create a cache entry, we'll just use the valid dest */
-	write_lock(&svc->sched_lock);
-	ip_vs_lblc_new(tbl, iph->daddr, dest);
-	write_unlock(&svc->sched_lock);
-
-out:
-	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
-		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(iph->daddr),
-		  NIPQUAD(dest->addr.ip),
-		  ntohs(dest->port));
-
-	return dest;
-}
-
-
-/*
- *      IPVS LBLC Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_lblc_scheduler =
-{
-	.name =			"lblc",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	0,
-#endif
-	.init_service =		ip_vs_lblc_init_svc,
-	.done_service =		ip_vs_lblc_done_svc,
-	.schedule =		ip_vs_lblc_schedule,
-};
-
-
-static int __init ip_vs_lblc_init(void)
-{
-	int ret;
-
-	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
-	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-	if (ret)
-		unregister_sysctl_table(sysctl_header);
-	return ret;
-}
-
-
-static void __exit ip_vs_lblc_cleanup(void)
-{
-	unregister_sysctl_table(sysctl_header);
-	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-}
-
-
-module_init(ip_vs_lblc_init);
-module_exit(ip_vs_lblc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
deleted file mode 100644
index 1f75ea83bcf..00000000000
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * IPVS:        Locality-Based Least-Connection with Replication scheduler
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Julian Anastasov        :    Added the missing (dest->weight>0)
- *                                  condition in the ip_vs_dest_set_max.
- *
- */
-
-/*
- * The lblc/r algorithm is as follows (pseudo code):
- *
- *       if serverSet[dest_ip] is null then
- *               n, serverSet[dest_ip] <- {weighted least-conn node};
- *       else
- *               n <- {least-conn (alive) node in serverSet[dest_ip]};
- *               if (n is null) OR
- *                  (n.conns>n.weight AND
- *                   there is a node m with m.conns<m.weight/2) then
- *                   n <- {weighted least-conn node};
- *                   add n to serverSet[dest_ip];
- *               if |serverSet[dest_ip]| > 1 AND
- *                   now - serverSet[dest_ip].lastMod > T then
- *                   m <- {most conn node in serverSet[dest_ip]};
- *                   remove m from serverSet[dest_ip];
- *       if serverSet[dest_ip] changed then
- *               serverSet[dest_ip].lastMod <- now;
- *
- *       return n;
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/jiffies.h>
-
-/* for sysctl */
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-#include <net/net_namespace.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *    It is for garbage collection of stale IPVS lblcr entries,
- *    when the table is full.
- */
-#define CHECK_EXPIRE_INTERVAL   (60*HZ)
-#define ENTRY_TIMEOUT           (6*60*HZ)
-
-/*
- *    It is for full expiration check.
- *    When there is no partial expiration check (garbage collection)
- *    in a half hour, do a full expiration check to collect stale
- *    entries that haven't been touched for a day.
- */
-#define COUNT_FOR_FULL_EXPIRATION   30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
-
-/*
- *     for IPVS lblcr entry hash table
- */
-#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
-#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
-#endif
-#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
-#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
-#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
-
-
-/*
- *      IPVS destination set structure and operations
- */
-struct ip_vs_dest_list {
-	struct ip_vs_dest_list  *next;          /* list link */
-	struct ip_vs_dest       *dest;          /* destination server */
-};
-
-struct ip_vs_dest_set {
-	atomic_t                size;           /* set size */
-	unsigned long           lastmod;        /* last modified time */
-	struct ip_vs_dest_list  *list;          /* destination list */
-	rwlock_t	        lock;           /* lock for this list */
-};
-
-
-static struct ip_vs_dest_list *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
-{
-	struct ip_vs_dest_list *e;
-
-	for (e=set->list; e!=NULL; e=e->next) {
-		if (e->dest == dest)
-			/* already existed */
-			return NULL;
-	}
-
-	e = kmalloc(sizeof(*e), GFP_ATOMIC);
-	if (e == NULL) {
-		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
-		return NULL;
-	}
-
-	atomic_inc(&dest->refcnt);
-	e->dest = dest;
-
-	/* link it to the list */
-	e->next = set->list;
-	set->list = e;
-	atomic_inc(&set->size);
-
-	set->lastmod = jiffies;
-	return e;
-}
-
-static void
-ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
-{
-	struct ip_vs_dest_list *e, **ep;
-
-	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
-		if (e->dest == dest) {
-			/* HIT */
-			*ep = e->next;
-			atomic_dec(&set->size);
-			set->lastmod = jiffies;
-			atomic_dec(&e->dest->refcnt);
-			kfree(e);
-			break;
-		}
-		ep = &e->next;
-	}
-}
-
-static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
-{
-	struct ip_vs_dest_list *e, **ep;
-
-	write_lock(&set->lock);
-	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
-		*ep = e->next;
-		/*
-		 * We don't kfree dest because it is refered either
-		 * by its service or by the trash dest list.
-		 */
-		atomic_dec(&e->dest->refcnt);
-		kfree(e);
-	}
-	write_unlock(&set->lock);
-}
-
-/* get weighted least-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
-{
-	register struct ip_vs_dest_list *e;
-	struct ip_vs_dest *dest, *least;
-	int loh, doh;
-
-	if (set == NULL)
-		return NULL;
-
-	/* select the first destination server, whose weight > 0 */
-	for (e=set->list; e!=NULL; e=e->next) {
-		least = e->dest;
-		if (least->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-
-		if ((atomic_read(&least->weight) > 0)
-		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
-			loh = atomic_read(&least->activeconns) * 50
-				+ atomic_read(&least->inactconns);
-			goto nextstage;
-		}
-	}
-	return NULL;
-
-	/* find the destination with the weighted least load */
-  nextstage:
-	for (e=e->next; e!=NULL; e=e->next) {
-		dest = e->dest;
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-
-		doh = atomic_read(&dest->activeconns) * 50
-			+ atomic_read(&dest->inactconns);
-		if ((loh * atomic_read(&dest->weight) >
-		     doh * atomic_read(&least->weight))
-		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->refcnt),
-		  atomic_read(&least->weight), loh);
-	return least;
-}
-
-
-/* get weighted most-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
-{
-	register struct ip_vs_dest_list *e;
-	struct ip_vs_dest *dest, *most;
-	int moh, doh;
-
-	if (set == NULL)
-		return NULL;
-
-	/* select the first destination server, whose weight > 0 */
-	for (e=set->list; e!=NULL; e=e->next) {
-		most = e->dest;
-		if (atomic_read(&most->weight) > 0) {
-			moh = atomic_read(&most->activeconns) * 50
-				+ atomic_read(&most->inactconns);
-			goto nextstage;
-		}
-	}
-	return NULL;
-
-	/* find the destination with the weighted most load */
-  nextstage:
-	for (e=e->next; e!=NULL; e=e->next) {
-		dest = e->dest;
-		doh = atomic_read(&dest->activeconns) * 50
-			+ atomic_read(&dest->inactconns);
-		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
-		if ((moh * atomic_read(&dest->weight) <
-		     doh * atomic_read(&most->weight))
-		    && (atomic_read(&dest->weight) > 0)) {
-			most = dest;
-			moh = doh;
-		}
-	}
-
-	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(most->addr.ip), ntohs(most->port),
-		  atomic_read(&most->activeconns),
-		  atomic_read(&most->refcnt),
-		  atomic_read(&most->weight), moh);
-	return most;
-}
-
-
-/*
- *      IPVS lblcr entry represents an association between destination
- *      IP address and its destination server set
- */
-struct ip_vs_lblcr_entry {
-	struct list_head        list;
-	__be32                   addr;           /* destination IP address */
-	struct ip_vs_dest_set   set;            /* destination server set */
-	unsigned long           lastuse;        /* last used time */
-};
-
-
-/*
- *      IPVS lblcr hash table
- */
-struct ip_vs_lblcr_table {
-	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
-	atomic_t                entries;        /* number of entries */
-	int                     max_size;       /* maximum size of entries */
-	struct timer_list       periodic_timer; /* collect stale entries */
-	int                     rover;          /* rover for expire check */
-	int                     counter;        /* counter for no expire */
-};
-
-
-/*
- *      IPVS LBLCR sysctl table
- */
-
-static ctl_table vs_vars_table[] = {
-	{
-		.procname	= "lblcr_expiration",
-		.data		= &sysctl_ip_vs_lblcr_expiration,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-	},
-	{ .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
-{
-	list_del(&en->list);
-	ip_vs_dest_set_eraseall(&en->set);
-	kfree(en);
-}
-
-
-/*
- *	Returns hash value for IPVS LBLCR entry
- */
-static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
-{
-	return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
-}
-
-
-/*
- *	Hash an entry in the ip_vs_lblcr_table.
- *	returns bool success.
- */
-static void
-ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
-{
-	unsigned hash = ip_vs_lblcr_hashkey(en->addr);
-
-	list_add(&en->list, &tbl->bucket[hash]);
-	atomic_inc(&tbl->entries);
-}
-
-
-/*
- *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- *  read lock.
- */
-static inline struct ip_vs_lblcr_entry *
-ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
-{
-	unsigned hash = ip_vs_lblcr_hashkey(addr);
-	struct ip_vs_lblcr_entry *en;
-
-	list_for_each_entry(en, &tbl->bucket[hash], list)
-		if (en->addr == addr)
-			return en;
-
-	return NULL;
-}
-
-
-/*
- * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
- */
-static inline struct ip_vs_lblcr_entry *
-ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
-		struct ip_vs_dest *dest)
-{
-	struct ip_vs_lblcr_entry *en;
-
-	en = ip_vs_lblcr_get(tbl, daddr);
-	if (!en) {
-		en = kmalloc(sizeof(*en), GFP_ATOMIC);
-		if (!en) {
-			IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
-			return NULL;
-		}
-
-		en->addr = daddr;
-		en->lastuse = jiffies;
-
-		/* initilize its dest set */
-		atomic_set(&(en->set.size), 0);
-		en->set.list = NULL;
-		rwlock_init(&en->set.lock);
-
-		ip_vs_lblcr_hash(tbl, en);
-	}
-
-	write_lock(&en->set.lock);
-	ip_vs_dest_set_insert(&en->set, dest);
-	write_unlock(&en->set.lock);
-
-	return en;
-}
-
-
-/*
- *      Flush all the entries of the specified table.
- */
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
-{
-	int i;
-	struct ip_vs_lblcr_entry *en, *nxt;
-
-	/* No locking required, only called during cleanup. */
-	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
-			ip_vs_lblcr_free(en);
-		}
-	}
-}
-
-
-static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
-{
-	struct ip_vs_lblcr_table *tbl = svc->sched_data;
-	unsigned long now = jiffies;
-	int i, j;
-	struct ip_vs_lblcr_entry *en, *nxt;
-
-	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
-
-		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
-				       now))
-				continue;
-
-			ip_vs_lblcr_free(en);
-			atomic_dec(&tbl->entries);
-		}
-		write_unlock(&svc->sched_lock);
-	}
-	tbl->rover = j;
-}
-
-
-/*
- *      Periodical timer handler for IPVS lblcr table
- *      It is used to collect stale entries when the number of entries
- *      exceeds the maximum size of the table.
- *
- *      Fixme: we probably need more complicated algorithm to collect
- *             entries that have not been used for a long time even
- *             if the number of entries doesn't exceed the maximum size
- *             of the table.
- *      The full expiration check is for this purpose now.
- */
-static void ip_vs_lblcr_check_expire(unsigned long data)
-{
-	struct ip_vs_service *svc = (struct ip_vs_service *) data;
-	struct ip_vs_lblcr_table *tbl = svc->sched_data;
-	unsigned long now = jiffies;
-	int goal;
-	int i, j;
-	struct ip_vs_lblcr_entry *en, *nxt;
-
-	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
-		/* do full expiration check */
-		ip_vs_lblcr_full_check(svc);
-		tbl->counter = 1;
-		goto out;
-	}
-
-	if (atomic_read(&tbl->entries) <= tbl->max_size) {
-		tbl->counter++;
-		goto out;
-	}
-
-	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
-	if (goal > tbl->max_size/2)
-		goal = tbl->max_size/2;
-
-	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
-
-		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
-				continue;
-
-			ip_vs_lblcr_free(en);
-			atomic_dec(&tbl->entries);
-			goal--;
-		}
-		write_unlock(&svc->sched_lock);
-		if (goal <= 0)
-			break;
-	}
-	tbl->rover = j;
-
-  out:
-	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
-}
-
-static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
-{
-	int i;
-	struct ip_vs_lblcr_table *tbl;
-
-	/*
-	 *    Allocate the ip_vs_lblcr_table for this service
-	 */
-	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
-	if (tbl == NULL) {
-		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
-		return -ENOMEM;
-	}
-	svc->sched_data = tbl;
-	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
-		  "current service\n", sizeof(*tbl));
-
-	/*
-	 *    Initialize the hash buckets
-	 */
-	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		INIT_LIST_HEAD(&tbl->bucket[i]);
-	}
-	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
-	tbl->rover = 0;
-	tbl->counter = 1;
-
-	/*
-	 *    Hook periodic timer for garbage collection
-	 */
-	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
-			(unsigned long)svc);
-	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
-
-	return 0;
-}
-
-
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_lblcr_table *tbl = svc->sched_data;
-
-	/* remove periodic timer */
-	del_timer_sync(&tbl->periodic_timer);
-
-	/* got to clean up table entries here */
-	ip_vs_lblcr_flush(tbl);
-
-	/* release the table itself */
-	kfree(tbl);
-	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
-		  sizeof(*tbl));
-
-	return 0;
-}
-
-
-static inline struct ip_vs_dest *
-__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
-{
-	struct ip_vs_dest *dest, *least;
-	int loh, doh;
-
-	/*
-	 * We think the overhead of processing active connections is fifty
-	 * times higher than that of inactive connections in average. (This
-	 * fifty times might not be accurate, we will change it later.) We
-	 * use the following formula to estimate the overhead:
-	 *                dest->activeconns*50 + dest->inactconns
-	 * and the load:
-	 *                (dest overhead) / dest->weight
-	 *
-	 * Remember -- no floats in kernel mode!!!
-	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
-	 *                h1/w1 > h2/w2
-	 * if every weight is larger than zero.
-	 *
-	 * The server with weight=0 is quiesced and will not receive any
-	 * new connection.
-	 */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-
-		if (atomic_read(&dest->weight) > 0) {
-			least = dest;
-			loh = atomic_read(&least->activeconns) * 50
-				+ atomic_read(&least->inactconns);
-			goto nextstage;
-		}
-	}
-	return NULL;
-
-	/*
-	 *    Find the destination with the least load.
-	 */
-  nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-
-		doh = atomic_read(&dest->activeconns) * 50
-			+ atomic_read(&dest->inactconns);
-		if (loh * atomic_read(&dest->weight) >
-		    doh * atomic_read(&least->weight)) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
-		  "activeconns %d refcnt %d weight %d overhead %d\n",
-		  NIPQUAD(least->addr.ip), ntohs(least->port),
-		  atomic_read(&least->activeconns),
-		  atomic_read(&least->refcnt),
-		  atomic_read(&least->weight), loh);
-
-	return least;
-}
-
-
-/*
- *   If this destination server is overloaded and there is a less loaded
- *   server, then return true.
- */
-static inline int
-is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
-	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
-		struct ip_vs_dest *d;
-
-		list_for_each_entry(d, &svc->destinations, n_list) {
-			if (atomic_read(&d->activeconns)*2
-			    < atomic_read(&d->weight)) {
-				return 1;
-			}
-		}
-	}
-	return 0;
-}
-
-
-/*
- *    Locality-Based (weighted) Least-Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_lblcr_table *tbl = svc->sched_data;
-	struct iphdr *iph = ip_hdr(skb);
-	struct ip_vs_dest *dest = NULL;
-	struct ip_vs_lblcr_entry *en;
-
-	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
-
-	/* First look in our cache */
-	read_lock(&svc->sched_lock);
-	en = ip_vs_lblcr_get(tbl, iph->daddr);
-	if (en) {
-		/* We only hold a read lock, but this is atomic */
-		en->lastuse = jiffies;
-
-		/* Get the least loaded destination */
-		read_lock(&en->set.lock);
-		dest = ip_vs_dest_set_min(&en->set);
-		read_unlock(&en->set.lock);
-
-		/* More than one destination + enough time passed by, cleanup */
-		if (atomic_read(&en->set.size) > 1 &&
-				time_after(jiffies, en->set.lastmod +
-				sysctl_ip_vs_lblcr_expiration)) {
-			struct ip_vs_dest *m;
-
-			write_lock(&en->set.lock);
-			m = ip_vs_dest_set_max(&en->set);
-			if (m)
-				ip_vs_dest_set_erase(&en->set, m);
-			write_unlock(&en->set.lock);
-		}
-
-		/* If the destination is not overloaded, use it */
-		if (dest && !is_overloaded(dest, svc)) {
-			read_unlock(&svc->sched_lock);
-			goto out;
-		}
-
-		/* The cache entry is invalid, time to schedule */
-		dest = __ip_vs_lblcr_schedule(svc, iph);
-		if (!dest) {
-			IP_VS_DBG(1, "no destination available\n");
-			read_unlock(&svc->sched_lock);
-			return NULL;
-		}
-
-		/* Update our cache entry */
-		write_lock(&en->set.lock);
-		ip_vs_dest_set_insert(&en->set, dest);
-		write_unlock(&en->set.lock);
-	}
-	read_unlock(&svc->sched_lock);
-
-	if (dest)
-		goto out;
-
-	/* No cache entry, time to schedule */
-	dest = __ip_vs_lblcr_schedule(svc, iph);
-	if (!dest) {
-		IP_VS_DBG(1, "no destination available\n");
-		return NULL;
-	}
-
-	/* If we fail to create a cache entry, we'll just use the valid dest */
-	write_lock(&svc->sched_lock);
-	ip_vs_lblcr_new(tbl, iph->daddr, dest);
-	write_unlock(&svc->sched_lock);
-
-out:
-	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
-		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(iph->daddr),
-		  NIPQUAD(dest->addr.ip),
-		  ntohs(dest->port));
-
-	return dest;
-}
-
-
-/*
- *      IPVS LBLCR Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
-{
-	.name =			"lblcr",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	0,
-#endif
-	.init_service =		ip_vs_lblcr_init_svc,
-	.done_service =		ip_vs_lblcr_done_svc,
-	.schedule =		ip_vs_lblcr_schedule,
-};
-
-
-static int __init ip_vs_lblcr_init(void)
-{
-	int ret;
-
-	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
-	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-	if (ret)
-		unregister_sysctl_table(sysctl_header);
-	return ret;
-}
-
-
-static void __exit ip_vs_lblcr_cleanup(void)
-{
-	unregister_sysctl_table(sysctl_header);
-	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-}
-
-
-module_init(ip_vs_lblcr_init);
-module_exit(ip_vs_lblcr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
deleted file mode 100644
index b69f808ac46..00000000000
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * IPVS:        Least-Connection Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Wensong Zhang            :     added the ip_vs_lc_update_svc
- *     Wensong Zhang            :     added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
-{
-	/*
-	 * We think the overhead of processing active connections is 256
-	 * times higher than that of inactive connections in average. (This
-	 * 256 times might not be accurate, we will change it later) We
-	 * use the following formula to estimate the overhead now:
-	 *		  dest->activeconns*256 + dest->inactconns
-	 */
-	return (atomic_read(&dest->activeconns) << 8) +
-		atomic_read(&dest->inactconns);
-}
-
-
-/*
- *	Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest, *least = NULL;
-	unsigned int loh = 0, doh;
-
-	IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
-
-	/*
-	 * Simply select the server with the least number of
-	 *        (activeconns<<5) + inactconns
-	 * Except whose weight is equal to zero.
-	 * If the weight is equal to zero, it means that the server is
-	 * quiesced, the existing connections to the server still get
-	 * served, but no new connection is assigned to the server.
-	 */
-
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
-		    atomic_read(&dest->weight) == 0)
-			continue;
-		doh = ip_vs_lc_dest_overhead(dest);
-		if (!least || doh < loh) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	if (least)
-	IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
-		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-		      atomic_read(&least->activeconns),
-		      atomic_read(&least->inactconns));
-
-	return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_lc_scheduler = {
-	.name =			"lc",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	1,
-#endif
-	.schedule =		ip_vs_lc_schedule,
-};
-
-
-static int __init ip_vs_lc_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
-}
-
-static void __exit ip_vs_lc_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
-}
-
-module_init(ip_vs_lc_init);
-module_exit(ip_vs_lc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
deleted file mode 100644
index 9a2d8033f08..00000000000
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * IPVS:        Never Queue scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The NQ algorithm adopts a two-speed model. When there is an idle server
- * available, the job will be sent to the idle server, instead of waiting
- * for a fast one. When there is no idle server available, the job will be
- * sent to the server that minimize its expected delay (The Shortest
- * Expected Delay scheduling algorithm).
- *
- * See the following paper for more information:
- * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
- * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
- * pages 986-994, 1988.
- *
- * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
- *
- * The difference between NQ and SED is that NQ can improve overall
- * system utilization.
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
-{
-	/*
-	 * We only use the active connection number in the cost
-	 * calculation here.
-	 */
-	return atomic_read(&dest->activeconns) + 1;
-}
-
-
-/*
- *	Weighted Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest, *least = NULL;
-	unsigned int loh = 0, doh;
-
-	IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
-
-	/*
-	 * We calculate the load of each dest server as follows:
-	 *	(server expected overhead) / dest->weight
-	 *
-	 * Remember -- no floats in kernel mode!!!
-	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
-	 *		  h1/w1 > h2/w2
-	 * if every weight is larger than zero.
-	 *
-	 * The server with weight=0 is quiesced and will not receive any
-	 * new connections.
-	 */
-
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
-		    !atomic_read(&dest->weight))
-			continue;
-
-		doh = ip_vs_nq_dest_overhead(dest);
-
-		/* return the server directly if it is idle */
-		if (atomic_read(&dest->activeconns) == 0) {
-			least = dest;
-			loh = doh;
-			goto out;
-		}
-
-		if (!least ||
-		    (loh * atomic_read(&dest->weight) >
-		     doh * atomic_read(&least->weight))) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	if (!least)
-		return NULL;
-
-  out:
-	IP_VS_DBG_BUF(6, "NQ: server %s:%u "
-		      "activeconns %d refcnt %d weight %d overhead %d\n",
-		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-		      atomic_read(&least->activeconns),
-		      atomic_read(&least->refcnt),
-		      atomic_read(&least->weight), loh);
-
-	return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_nq_scheduler =
-{
-	.name =			"nq",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	1,
-#endif
-	.schedule =		ip_vs_nq_schedule,
-};
-
-
-static int __init ip_vs_nq_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
-}
-
-static void __exit ip_vs_nq_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
-}
-
-module_init(ip_vs_nq_init);
-module_exit(ip_vs_nq_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
deleted file mode 100644
index 0791f9e08fe..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * ip_vs_proto.c: transport protocol load balancing support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * IPVS protocols can only be registered/unregistered when the ipvs
- * module is loaded/unloaded, so no lock is needed in accessing the
- * ipvs protocol table.
- */
-
-#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
-#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))
-
-static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
-
-
-/*
- *	register an ipvs protocol
- */
-static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
-{
-	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
-
-	pp->next = ip_vs_proto_table[hash];
-	ip_vs_proto_table[hash] = pp;
-
-	if (pp->init != NULL)
-		pp->init(pp);
-
-	return 0;
-}
-
-
-/*
- *	unregister an ipvs protocol
- */
-static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
-{
-	struct ip_vs_protocol **pp_p;
-	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
-
-	pp_p = &ip_vs_proto_table[hash];
-	for (; *pp_p; pp_p = &(*pp_p)->next) {
-		if (*pp_p == pp) {
-			*pp_p = pp->next;
-			if (pp->exit != NULL)
-				pp->exit(pp);
-			return 0;
-		}
-	}
-
-	return -ESRCH;
-}
-
-
-/*
- *	get ip_vs_protocol object by its proto.
- */
-struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
-{
-	struct ip_vs_protocol *pp;
-	unsigned hash = IP_VS_PROTO_HASH(proto);
-
-	for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
-		if (pp->protocol == proto)
-			return pp;
-	}
-
-	return NULL;
-}
-
-
-/*
- *	Propagate event for state change to all protocols
- */
-void ip_vs_protocol_timeout_change(int flags)
-{
-	struct ip_vs_protocol *pp;
-	int i;
-
-	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
-		for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
-			if (pp->timeout_change)
-				pp->timeout_change(pp, flags);
-		}
-	}
-}
-
-
-int *
-ip_vs_create_timeout_table(int *table, int size)
-{
-	return kmemdup(table, size, GFP_ATOMIC);
-}
-
-
-/*
- *	Set timeout value for state specified by name
- */
-int
-ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
-{
-	int i;
-
-	if (!table || !name || !to)
-		return -EINVAL;
-
-	for (i = 0; i < num; i++) {
-		if (strcmp(names[i], name))
-			continue;
-		table[i] = to * HZ;
-		return 0;
-	}
-	return -ENOENT;
-}
-
-
-const char * ip_vs_state_name(__u16 proto, int state)
-{
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
-
-	if (pp == NULL || pp->state_name == NULL)
-		return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
-	return pp->state_name(state);
-}
-
-
-static void
-ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
-			     const struct sk_buff *skb,
-			     int offset,
-			     const char *msg)
-{
-	char buf[128];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else if (ih->frag_off & htons(IP_OFFSET))
-		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
-			pp->name, NIPQUAD(ih->saddr),
-			NIPQUAD(ih->daddr));
-	else {
-		__be16 _ports[2], *pptr
-;
-		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
-					  sizeof(_ports), _ports);
-		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
-				pp->name,
-				NIPQUAD(ih->saddr),
-				NIPQUAD(ih->daddr));
-		else
-			sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
-				pp->name,
-				NIPQUAD(ih->saddr),
-				ntohs(pptr[0]),
-				NIPQUAD(ih->daddr),
-				ntohs(pptr[1]));
-	}
-
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
-			     const struct sk_buff *skb,
-			     int offset,
-			     const char *msg)
-{
-	char buf[192];
-	struct ipv6hdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else if (ih->nexthdr == IPPROTO_FRAGMENT)
-		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
-			pp->name, NIP6(ih->saddr),
-			NIP6(ih->daddr));
-	else {
-		__be16 _ports[2], *pptr;
-
-		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
-					  sizeof(_ports), _ports);
-		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
-				pp->name,
-				NIP6(ih->saddr),
-				NIP6(ih->daddr));
-		else
-			sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
-				pp->name,
-				NIP6(ih->saddr),
-				ntohs(pptr[0]),
-				NIP6(ih->daddr),
-				ntohs(pptr[1]));
-	}
-
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-#endif
-
-
-void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
-			  const struct sk_buff *skb,
-			  int offset,
-			  const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
-		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
-	else
-#endif
-		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
-int __init ip_vs_protocol_init(void)
-{
-	char protocols[64];
-#define REGISTER_PROTOCOL(p)			\
-	do {					\
-		register_ip_vs_protocol(p);	\
-		strcat(protocols, ", ");	\
-		strcat(protocols, (p)->name);	\
-	} while (0)
-
-	protocols[0] = '\0';
-	protocols[2] = '\0';
-#ifdef CONFIG_IP_VS_PROTO_TCP
-	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
-	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_AH
-	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_ESP
-	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
-#endif
-	IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
-
-	return 0;
-}
-
-
-void ip_vs_protocol_cleanup(void)
-{
-	struct ip_vs_protocol *pp;
-	int i;
-
-	/* unregister all the ipvs protocols */
-	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
-		while ((pp = ip_vs_proto_table[i]) != NULL)
-			unregister_ip_vs_protocol(pp);
-	}
-}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
deleted file mode 100644
index 80ab0c8e5b4..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * ip_vs_proto_ah_esp.c:	AH/ESP IPSec load balancing support for IPVS
- *
- * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
- *		Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/* TODO:
-
-struct isakmp_hdr {
-	__u8		icookie[8];
-	__u8		rcookie[8];
-	__u8		np;
-	__u8		version;
-	__u8		xchgtype;
-	__u8		flags;
-	__u32		msgid;
-	__u32		length;
-};
-
-*/
-
-#define PORT_ISAKMP	500
-
-
-static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		   const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		   int inverse)
-{
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->saddr,
-				       htons(PORT_ISAKMP),
-				       &iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->daddr,
-				       htons(PORT_ISAKMP),
-				       &iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
-	if (!cp) {
-		/*
-		 * We are not sure if the packet is from our
-		 * service, so our conn_schedule hook should return NF_ACCEPT
-		 */
-		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
-			      "%s%s %s->%s\n",
-			      inverse ? "ICMP+" : "",
-			      pp->name,
-			      IP_VS_DBG_ADDR(af, &iph->saddr),
-			      IP_VS_DBG_ADDR(af, &iph->daddr));
-	}
-
-	return cp;
-}
-
-
-static struct ip_vs_conn *
-ah_esp_conn_out_get(int af, const struct sk_buff *skb,
-		    struct ip_vs_protocol *pp,
-		    const struct ip_vs_iphdr *iph,
-		    unsigned int proto_off,
-		    int inverse)
-{
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->saddr,
-					htons(PORT_ISAKMP),
-					&iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->daddr,
-					htons(PORT_ISAKMP),
-					&iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
-	if (!cp) {
-		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
-			      "%s%s %s->%s\n",
-			      inverse ? "ICMP+" : "",
-			      pp->name,
-			      IP_VS_DBG_ADDR(af, &iph->saddr),
-			      IP_VS_DBG_ADDR(af, &iph->daddr));
-	}
-
-	return cp;
-}
-
-
-static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-		     int *verdict, struct ip_vs_conn **cpp)
-{
-	/*
-	 * AH/ESP is only related traffic. Pass the packet to IP stack.
-	 */
-	*verdict = NF_ACCEPT;
-	return 0;
-}
-
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else
-		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
-			pp->name, NIPQUAD(ih->saddr),
-			NIPQUAD(ih->daddr));
-
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct ipv6hdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else
-		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
-			pp->name, NIP6(ih->saddr),
-			NIP6(ih->daddr));
-
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		    int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
-		ah_esp_debug_packet_v6(pp, skb, offset, msg);
-	else
-#endif
-		ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-#ifdef CONFIG_IP_VS_PROTO_AH
-struct ip_vs_protocol ip_vs_protocol_ah = {
-	.name =			"AH",
-	.protocol =		IPPROTO_AH,
-	.num_states =		1,
-	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
-	.conn_schedule =	ah_esp_conn_schedule,
-	.conn_in_get =		ah_esp_conn_in_get,
-	.conn_out_get =		ah_esp_conn_out_get,
-	.snat_handler =		NULL,
-	.dnat_handler =		NULL,
-	.csum_check =		NULL,
-	.state_transition =	NULL,
-	.register_app =		NULL,
-	.unregister_app =	NULL,
-	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
-	.timeout_change =	NULL,		/* ISAKMP */
-	.set_state_timeout =	NULL,
-};
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_ESP
-struct ip_vs_protocol ip_vs_protocol_esp = {
-	.name =			"ESP",
-	.protocol =		IPPROTO_ESP,
-	.num_states =		1,
-	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
-	.conn_schedule =	ah_esp_conn_schedule,
-	.conn_in_get =		ah_esp_conn_in_get,
-	.conn_out_get =		ah_esp_conn_out_get,
-	.snat_handler =		NULL,
-	.dnat_handler =		NULL,
-	.csum_check =		NULL,
-	.state_transition =	NULL,
-	.register_app =		NULL,
-	.unregister_app =	NULL,
-	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
-	.timeout_change =	NULL,		/* ISAKMP */
-};
-#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
deleted file mode 100644
index dd4566ea2bf..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ /dev/null
@@ -1,732 +0,0 @@
-/*
- * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>                  /* for tcphdr */
-#include <net/ip.h>
-#include <net/tcp.h>                    /* for csum_tcpudp_magic */
-#include <net/ip6_checksum.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-static struct ip_vs_conn *
-tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		int inverse)
-{
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->saddr, pptr[0],
-					 &iph->daddr, pptr[1]);
-	} else {
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->daddr, pptr[1],
-					 &iph->saddr, pptr[0]);
-	}
-}
-
-static struct ip_vs_conn *
-tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		 int inverse)
-{
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->saddr, pptr[0],
-					  &iph->daddr, pptr[1]);
-	} else {
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->daddr, pptr[1],
-					  &iph->saddr, pptr[0]);
-	}
-}
-
-
-static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-		  int *verdict, struct ip_vs_conn **cpp)
-{
-	struct ip_vs_service *svc;
-	struct tcphdr _tcph, *th;
-	struct ip_vs_iphdr iph;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
-	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
-	if (th == NULL) {
-		*verdict = NF_DROP;
-		return 0;
-	}
-
-	if (th->syn &&
-	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
-				     th->dest))) {
-		if (ip_vs_todrop()) {
-			/*
-			 * It seems that we are very loaded.
-			 * We have to drop this packet :(
-			 */
-			ip_vs_service_put(svc);
-			*verdict = NF_DROP;
-			return 0;
-		}
-
-		/*
-		 * Let the virtual server select a real server for the
-		 * incoming connection, and create a connection entry.
-		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
-			*verdict = ip_vs_leave(svc, skb, pp);
-			return 0;
-		}
-		ip_vs_service_put(svc);
-	}
-	return 1;
-}
-
-
-static inline void
-tcp_fast_csum_update(int af, struct tcphdr *tcph,
-		     const union nf_inet_addr *oldip,
-		     const union nf_inet_addr *newip,
-		     __be16 oldport, __be16 newport)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		tcph->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-					 ip_vs_check_diff2(oldport, newport,
-						~csum_unfold(tcph->check))));
-	else
-#endif
-	tcph->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-				 ip_vs_check_diff2(oldport, newport,
-						~csum_unfold(tcph->check))));
-}
-
-
-static inline void
-tcp_partial_csum_update(int af, struct tcphdr *tcph,
-		     const union nf_inet_addr *oldip,
-		     const union nf_inet_addr *newip,
-		     __be16 oldlen, __be16 newlen)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		tcph->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
-	else
-#endif
-	tcph->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
-}
-
-
-static int
-tcp_snat_handler(struct sk_buff *skb,
-		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-	struct tcphdr *tcph;
-	unsigned int tcphoff;
-	int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (cp->af == AF_INET6)
-		tcphoff = sizeof(struct ipv6hdr);
-	else
-#endif
-		tcphoff = ip_hdrlen(skb);
-	oldlen = skb->len - tcphoff;
-
-	/* csum_check requires unshared skb */
-	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
-		return 0;
-
-	if (unlikely(cp->app != NULL)) {
-		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-			return 0;
-
-		/* Call application helper if needed */
-		if (!ip_vs_app_pkt_out(cp, skb))
-			return 0;
-	}
-
-	tcph = (void *)skb_network_header(skb) + tcphoff;
-	tcph->source = cp->vport;
-
-	/* Adjust TCP checksums */
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
-					htonl(oldlen),
-					htonl(skb->len - tcphoff));
-	} else if (!cp->app) {
-		/* Only port and addr are changed, do fast csum update */
-		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
-				     cp->dport, cp->vport);
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
-	} else {
-		/* full checksum calculation */
-		tcph->check = 0;
-		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-		if (cp->af == AF_INET6)
-			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
-						      &cp->caddr.in6,
-						      skb->len - tcphoff,
-						      cp->protocol, skb->csum);
-		else
-#endif
-			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
-							cp->caddr.ip,
-							skb->len - tcphoff,
-							cp->protocol,
-							skb->csum);
-
-		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
-			  pp->name, tcph->check,
-			  (char*)&(tcph->check) - (char*)tcph);
-	}
-	return 1;
-}
-
-
-static int
-tcp_dnat_handler(struct sk_buff *skb,
-		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-	struct tcphdr *tcph;
-	unsigned int tcphoff;
-	int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (cp->af == AF_INET6)
-		tcphoff = sizeof(struct ipv6hdr);
-	else
-#endif
-		tcphoff = ip_hdrlen(skb);
-	oldlen = skb->len - tcphoff;
-
-	/* csum_check requires unshared skb */
-	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
-		return 0;
-
-	if (unlikely(cp->app != NULL)) {
-		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-			return 0;
-
-		/*
-		 *	Attempt ip_vs_app call.
-		 *	It will fix ip_vs_conn and iph ack_seq stuff
-		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
-			return 0;
-	}
-
-	tcph = (void *)skb_network_header(skb) + tcphoff;
-	tcph->dest = cp->dport;
-
-	/*
-	 *	Adjust TCP checksums
-	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
-					htonl(oldlen),
-					htonl(skb->len - tcphoff));
-	} else if (!cp->app) {
-		/* Only port and addr are changed, do fast csum update */
-		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
-				     cp->vport, cp->dport);
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
-	} else {
-		/* full checksum calculation */
-		tcph->check = 0;
-		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-		if (cp->af == AF_INET6)
-			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
-						      &cp->daddr.in6,
-						      skb->len - tcphoff,
-						      cp->protocol, skb->csum);
-		else
-#endif
-			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
-							cp->daddr.ip,
-							skb->len - tcphoff,
-							cp->protocol,
-							skb->csum);
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	}
-	return 1;
-}
-
-
-static int
-tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
-	unsigned int tcphoff;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		tcphoff = sizeof(struct ipv6hdr);
-	else
-#endif
-		tcphoff = ip_hdrlen(skb);
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_NONE:
-		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-	case CHECKSUM_COMPLETE:
-#ifdef CONFIG_IP_VS_IPV6
-		if (af == AF_INET6) {
-			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-					    &ipv6_hdr(skb)->daddr,
-					    skb->len - tcphoff,
-					    ipv6_hdr(skb)->nexthdr,
-					    skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-						 "Failed checksum for");
-				return 0;
-			}
-		} else
-#endif
-			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
-					      ip_hdr(skb)->daddr,
-					      skb->len - tcphoff,
-					      ip_hdr(skb)->protocol,
-					      skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-						 "Failed checksum for");
-				return 0;
-			}
-		break;
-	default:
-		/* No need to checksum. */
-		break;
-	}
-
-	return 1;
-}
-
-
-#define TCP_DIR_INPUT		0
-#define TCP_DIR_OUTPUT		4
-#define TCP_DIR_INPUT_ONLY	8
-
-static const int tcp_state_off[IP_VS_DIR_LAST] = {
-	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
-	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
-	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
-};
-
-/*
- *	Timeout table[state]
- */
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
-	[IP_VS_TCP_S_NONE]		=	2*HZ,
-	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
-	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
-	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
-	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
-	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
-	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
-	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
-	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
-	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
-	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
-	[IP_VS_TCP_S_LAST]		=	2*HZ,
-};
-
-static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
-	[IP_VS_TCP_S_NONE]		=	"NONE",
-	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
-	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
-	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
-	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
-	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
-	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
-	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
-	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
-	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
-	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
-	[IP_VS_TCP_S_LAST]		=	"BUG!",
-};
-
-#define sNO IP_VS_TCP_S_NONE
-#define sES IP_VS_TCP_S_ESTABLISHED
-#define sSS IP_VS_TCP_S_SYN_SENT
-#define sSR IP_VS_TCP_S_SYN_RECV
-#define sFW IP_VS_TCP_S_FIN_WAIT
-#define sTW IP_VS_TCP_S_TIME_WAIT
-#define sCL IP_VS_TCP_S_CLOSE
-#define sCW IP_VS_TCP_S_CLOSE_WAIT
-#define sLA IP_VS_TCP_S_LAST_ACK
-#define sLI IP_VS_TCP_S_LISTEN
-#define sSA IP_VS_TCP_S_SYNACK
-
-struct tcp_states_t {
-	int next_state[IP_VS_TCP_S_LAST];
-};
-
-static const char * tcp_state_name(int state)
-{
-	if (state >= IP_VS_TCP_S_LAST)
-		return "ERR!";
-	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
-}
-
-static struct tcp_states_t tcp_states [] = {
-/*	INPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
-/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
-
-/*	OUTPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
-/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
-/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
-/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
-/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
-
-/*	INPUT-ONLY */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
-/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-};
-
-static struct tcp_states_t tcp_states_dos [] = {
-/*	INPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
-/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-
-/*	OUTPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
-/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
-/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
-/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
-/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
-
-/*	INPUT-ONLY */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
-/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
-/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-};
-
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-	int on = (flags & 1);		/* secure_tcp */
-
-	/*
-	** FIXME: change secure_tcp to independent sysctl var
-	** or make it per-service or per-app because it is valid
-	** for most if not for all of the applications. Something
-	** like "capabilities" (flags) for each object.
-	*/
-	tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
-				       tcp_state_name_table, sname, to);
-}
-
-static inline int tcp_state_idx(struct tcphdr *th)
-{
-	if (th->rst)
-		return 3;
-	if (th->syn)
-		return 0;
-	if (th->fin)
-		return 1;
-	if (th->ack)
-		return 2;
-	return -1;
-}
-
-static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
-	      int direction, struct tcphdr *th)
-{
-	int state_idx;
-	int new_state = IP_VS_TCP_S_CLOSE;
-	int state_off = tcp_state_off[direction];
-
-	/*
-	 *    Update state offset to INPUT_ONLY if necessary
-	 *    or delete NO_OUTPUT flag if output packet detected
-	 */
-	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
-		if (state_off == TCP_DIR_OUTPUT)
-			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
-		else
-			state_off = TCP_DIR_INPUT_ONLY;
-	}
-
-	if ((state_idx = tcp_state_idx(th)) < 0) {
-		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
-		goto tcp_state_out;
-	}
-
-	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
-
-  tcp_state_out:
-	if (new_state != cp->state) {
-		struct ip_vs_dest *dest = cp->dest;
-
-		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
-			      "%s:%d state: %s->%s conn->refcnt:%d\n",
-			      pp->name,
-			      ((state_off == TCP_DIR_OUTPUT) ?
-			       "output " : "input "),
-			      th->syn ? 'S' : '.',
-			      th->fin ? 'F' : '.',
-			      th->ack ? 'A' : '.',
-			      th->rst ? 'R' : '.',
-			      IP_VS_DBG_ADDR(cp->af, &cp->daddr),
-			      ntohs(cp->dport),
-			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
-			      ntohs(cp->cport),
-			      tcp_state_name(cp->state),
-			      tcp_state_name(new_state),
-			      atomic_read(&cp->refcnt));
-
-		if (dest) {
-			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_dec(&dest->activeconns);
-				atomic_inc(&dest->inactconns);
-				cp->flags |= IP_VS_CONN_F_INACTIVE;
-			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_inc(&dest->activeconns);
-				atomic_dec(&dest->inactconns);
-				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
-			}
-		}
-	}
-
-	cp->timeout = pp->timeout_table[cp->state = new_state];
-}
-
-
-/*
- *	Handle state transitions
- */
-static int
-tcp_state_transition(struct ip_vs_conn *cp, int direction,
-		     const struct sk_buff *skb,
-		     struct ip_vs_protocol *pp)
-{
-	struct tcphdr _tcph, *th;
-
-#ifdef CONFIG_IP_VS_IPV6
-	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
-#else
-	int ihl = ip_hdrlen(skb);
-#endif
-
-	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
-	if (th == NULL)
-		return 0;
-
-	spin_lock(&cp->lock);
-	set_tcp_state(pp, cp, direction, th);
-	spin_unlock(&cp->lock);
-
-	return 1;
-}
-
-
-/*
- *	Hash table for TCP application incarnations
- */
-#define	TCP_APP_TAB_BITS	4
-#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
-#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
-static inline __u16 tcp_app_hashkey(__be16 port)
-{
-	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
-		& TCP_APP_TAB_MASK;
-}
-
-
-static int tcp_register_app(struct ip_vs_app *inc)
-{
-	struct ip_vs_app *i;
-	__u16 hash;
-	__be16 port = inc->port;
-	int ret = 0;
-
-	hash = tcp_app_hashkey(port);
-
-	spin_lock_bh(&tcp_app_lock);
-	list_for_each_entry(i, &tcp_apps[hash], p_list) {
-		if (i->port == port) {
-			ret = -EEXIST;
-			goto out;
-		}
-	}
-	list_add(&inc->p_list, &tcp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_tcp.appcnt);
-
-  out:
-	spin_unlock_bh(&tcp_app_lock);
-	return ret;
-}
-
-
-static void
-tcp_unregister_app(struct ip_vs_app *inc)
-{
-	spin_lock_bh(&tcp_app_lock);
-	atomic_dec(&ip_vs_protocol_tcp.appcnt);
-	list_del(&inc->p_list);
-	spin_unlock_bh(&tcp_app_lock);
-}
-
-
-static int
-tcp_app_conn_bind(struct ip_vs_conn *cp)
-{
-	int hash;
-	struct ip_vs_app *inc;
-	int result = 0;
-
-	/* Default binding: bind app only for NAT */
-	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
-		return 0;
-
-	/* Lookup application incarnations and bind the right one */
-	hash = tcp_app_hashkey(cp->vport);
-
-	spin_lock(&tcp_app_lock);
-	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
-		if (inc->port == cp->vport) {
-			if (unlikely(!ip_vs_app_inc_get(inc)))
-				break;
-			spin_unlock(&tcp_app_lock);
-
-			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
-				      "%s:%u to app %s on port %u\n",
-				      __func__,
-				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
-				      ntohs(cp->cport),
-				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
-				      ntohs(cp->vport),
-				      inc->name, ntohs(inc->port));
-
-			cp->app = inc;
-			if (inc->init_conn)
-				result = inc->init_conn(inc, cp);
-			goto out;
-		}
-	}
-	spin_unlock(&tcp_app_lock);
-
-  out:
-	return result;
-}
-
-
-/*
- *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
- */
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
-{
-	spin_lock(&cp->lock);
-	cp->state = IP_VS_TCP_S_LISTEN;
-	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
-	spin_unlock(&cp->lock);
-}
-
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
-{
-	IP_VS_INIT_HASH_TABLE(tcp_apps);
-	pp->timeout_table = tcp_timeouts;
-}
-
-
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_tcp = {
-	.name =			"TCP",
-	.protocol =		IPPROTO_TCP,
-	.num_states =		IP_VS_TCP_S_LAST,
-	.dont_defrag =		0,
-	.appcnt =		ATOMIC_INIT(0),
-	.init =			ip_vs_tcp_init,
-	.exit =			ip_vs_tcp_exit,
-	.register_app =		tcp_register_app,
-	.unregister_app =	tcp_unregister_app,
-	.conn_schedule =	tcp_conn_schedule,
-	.conn_in_get =		tcp_conn_in_get,
-	.conn_out_get =		tcp_conn_out_get,
-	.snat_handler =		tcp_snat_handler,
-	.dnat_handler =		tcp_dnat_handler,
-	.csum_check =		tcp_csum_check,
-	.state_name =		tcp_state_name,
-	.state_transition =	tcp_state_transition,
-	.app_conn_bind =	tcp_app_conn_bind,
-	.debug_packet =		ip_vs_tcpudp_debug_packet,
-	.timeout_change =	tcp_timeout_change,
-	.set_state_timeout =	tcp_set_state_timeout,
-};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
deleted file mode 100644
index 6eb6039d634..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- * ip_vs_proto_udp.c:	UDP load balancing support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/udp.h>
-
-#include <net/ip_vs.h>
-#include <net/ip.h>
-#include <net/ip6_checksum.h>
-
-static struct ip_vs_conn *
-udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		int inverse)
-{
-	struct ip_vs_conn *cp;
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, iph->protocol,
-				       &iph->saddr, pptr[0],
-				       &iph->daddr, pptr[1]);
-	} else {
-		cp = ip_vs_conn_in_get(af, iph->protocol,
-				       &iph->daddr, pptr[1],
-				       &iph->saddr, pptr[0]);
-	}
-
-	return cp;
-}
-
-
-static struct ip_vs_conn *
-udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		 int inverse)
-{
-	struct ip_vs_conn *cp;
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, iph->protocol,
-					&iph->saddr, pptr[0],
-					&iph->daddr, pptr[1]);
-	} else {
-		cp = ip_vs_conn_out_get(af, iph->protocol,
-					&iph->daddr, pptr[1],
-					&iph->saddr, pptr[0]);
-	}
-
-	return cp;
-}
-
-
-static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-		  int *verdict, struct ip_vs_conn **cpp)
-{
-	struct ip_vs_service *svc;
-	struct udphdr _udph, *uh;
-	struct ip_vs_iphdr iph;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
-	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
-	if (uh == NULL) {
-		*verdict = NF_DROP;
-		return 0;
-	}
-
-	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
-				&iph.daddr, uh->dest);
-	if (svc) {
-		if (ip_vs_todrop()) {
-			/*
-			 * It seems that we are very loaded.
-			 * We have to drop this packet :(
-			 */
-			ip_vs_service_put(svc);
-			*verdict = NF_DROP;
-			return 0;
-		}
-
-		/*
-		 * Let the virtual server select a real server for the
-		 * incoming connection, and create a connection entry.
-		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
-			*verdict = ip_vs_leave(svc, skb, pp);
-			return 0;
-		}
-		ip_vs_service_put(svc);
-	}
-	return 1;
-}
-
-
-static inline void
-udp_fast_csum_update(int af, struct udphdr *uhdr,
-		     const union nf_inet_addr *oldip,
-		     const union nf_inet_addr *newip,
-		     __be16 oldport, __be16 newport)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		uhdr->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-					 ip_vs_check_diff2(oldport, newport,
-						~csum_unfold(uhdr->check))));
-	else
-#endif
-		uhdr->check =
-			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-					 ip_vs_check_diff2(oldport, newport,
-						~csum_unfold(uhdr->check))));
-	if (!uhdr->check)
-		uhdr->check = CSUM_MANGLED_0;
-}
-
-static inline void
-udp_partial_csum_update(int af, struct udphdr *uhdr,
-		     const union nf_inet_addr *oldip,
-		     const union nf_inet_addr *newip,
-		     __be16 oldlen, __be16 newlen)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		uhdr->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
-	else
-#endif
-	uhdr->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
-}
-
-
-static int
-udp_snat_handler(struct sk_buff *skb,
-		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-	struct udphdr *udph;
-	unsigned int udphoff;
-	int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (cp->af == AF_INET6)
-		udphoff = sizeof(struct ipv6hdr);
-	else
-#endif
-		udphoff = ip_hdrlen(skb);
-	oldlen = skb->len - udphoff;
-
-	/* csum_check requires unshared skb */
-	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
-		return 0;
-
-	if (unlikely(cp->app != NULL)) {
-		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-			return 0;
-
-		/*
-		 *	Call application helper if needed
-		 */
-		if (!ip_vs_app_pkt_out(cp, skb))
-			return 0;
-	}
-
-	udph = (void *)skb_network_header(skb) + udphoff;
-	udph->source = cp->vport;
-
-	/*
-	 *	Adjust UDP checksums
-	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
-					htonl(oldlen),
-					htonl(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
-		/* Only port and addr are changed, do fast csum update */
-		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
-				     cp->dport, cp->vport);
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
-	} else {
-		/* full checksum calculation */
-		udph->check = 0;
-		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-		if (cp->af == AF_INET6)
-			udph->check = csum_ipv6_magic(&cp->vaddr.in6,
-						      &cp->caddr.in6,
-						      skb->len - udphoff,
-						      cp->protocol, skb->csum);
-		else
-#endif
-			udph->check = csum_tcpudp_magic(cp->vaddr.ip,
-							cp->caddr.ip,
-							skb->len - udphoff,
-							cp->protocol,
-							skb->csum);
-		if (udph->check == 0)
-			udph->check = CSUM_MANGLED_0;
-		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
-			  pp->name, udph->check,
-			  (char*)&(udph->check) - (char*)udph);
-	}
-	return 1;
-}
-
-
-static int
-udp_dnat_handler(struct sk_buff *skb,
-		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-	struct udphdr *udph;
-	unsigned int udphoff;
-	int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (cp->af == AF_INET6)
-		udphoff = sizeof(struct ipv6hdr);
-	else
-#endif
-		udphoff = ip_hdrlen(skb);
-	oldlen = skb->len - udphoff;
-
-	/* csum_check requires unshared skb */
-	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
-		return 0;
-
-	if (unlikely(cp->app != NULL)) {
-		/* Some checks before mangling */
-		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-			return 0;
-
-		/*
-		 *	Attempt ip_vs_app call.
-		 *	It will fix ip_vs_conn
-		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
-			return 0;
-	}
-
-	udph = (void *)skb_network_header(skb) + udphoff;
-	udph->dest = cp->dport;
-
-	/*
-	 *	Adjust UDP checksums
-	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
-					htonl(oldlen),
-					htonl(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
-		/* Only port and addr are changed, do fast csum update */
-		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
-				     cp->vport, cp->dport);
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
-	} else {
-		/* full checksum calculation */
-		udph->check = 0;
-		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-		if (cp->af == AF_INET6)
-			udph->check = csum_ipv6_magic(&cp->caddr.in6,
-						      &cp->daddr.in6,
-						      skb->len - udphoff,
-						      cp->protocol, skb->csum);
-		else
-#endif
-			udph->check = csum_tcpudp_magic(cp->caddr.ip,
-							cp->daddr.ip,
-							skb->len - udphoff,
-							cp->protocol,
-							skb->csum);
-		if (udph->check == 0)
-			udph->check = CSUM_MANGLED_0;
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	}
-	return 1;
-}
-
-
-static int
-udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
-	struct udphdr _udph, *uh;
-	unsigned int udphoff;
-
-#ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
-		udphoff = sizeof(struct ipv6hdr);
-	else
-#endif
-		udphoff = ip_hdrlen(skb);
-
-	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
-	if (uh == NULL)
-		return 0;
-
-	if (uh->check != 0) {
-		switch (skb->ip_summed) {
-		case CHECKSUM_NONE:
-			skb->csum = skb_checksum(skb, udphoff,
-						 skb->len - udphoff, 0);
-		case CHECKSUM_COMPLETE:
-#ifdef CONFIG_IP_VS_IPV6
-			if (af == AF_INET6) {
-				if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-						    &ipv6_hdr(skb)->daddr,
-						    skb->len - udphoff,
-						    ipv6_hdr(skb)->nexthdr,
-						    skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-							 "Failed checksum for");
-					return 0;
-				}
-			} else
-#endif
-				if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
-						      ip_hdr(skb)->daddr,
-						      skb->len - udphoff,
-						      ip_hdr(skb)->protocol,
-						      skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-							 "Failed checksum for");
-					return 0;
-				}
-			break;
-		default:
-			/* No need to checksum. */
-			break;
-		}
-	}
-	return 1;
-}
-
-
-/*
- *	Note: the caller guarantees that only one of register_app,
- *	unregister_app or app_conn_bind is called each time.
- */
-
-#define	UDP_APP_TAB_BITS	4
-#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
-#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
-static inline __u16 udp_app_hashkey(__be16 port)
-{
-	return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
-		& UDP_APP_TAB_MASK;
-}
-
-
-static int udp_register_app(struct ip_vs_app *inc)
-{
-	struct ip_vs_app *i;
-	__u16 hash;
-	__be16 port = inc->port;
-	int ret = 0;
-
-	hash = udp_app_hashkey(port);
-
-
-	spin_lock_bh(&udp_app_lock);
-	list_for_each_entry(i, &udp_apps[hash], p_list) {
-		if (i->port == port) {
-			ret = -EEXIST;
-			goto out;
-		}
-	}
-	list_add(&inc->p_list, &udp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_udp.appcnt);
-
-  out:
-	spin_unlock_bh(&udp_app_lock);
-	return ret;
-}
-
-
-static void
-udp_unregister_app(struct ip_vs_app *inc)
-{
-	spin_lock_bh(&udp_app_lock);
-	atomic_dec(&ip_vs_protocol_udp.appcnt);
-	list_del(&inc->p_list);
-	spin_unlock_bh(&udp_app_lock);
-}
-
-
-static int udp_app_conn_bind(struct ip_vs_conn *cp)
-{
-	int hash;
-	struct ip_vs_app *inc;
-	int result = 0;
-
-	/* Default binding: bind app only for NAT */
-	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
-		return 0;
-
-	/* Lookup application incarnations and bind the right one */
-	hash = udp_app_hashkey(cp->vport);
-
-	spin_lock(&udp_app_lock);
-	list_for_each_entry(inc, &udp_apps[hash], p_list) {
-		if (inc->port == cp->vport) {
-			if (unlikely(!ip_vs_app_inc_get(inc)))
-				break;
-			spin_unlock(&udp_app_lock);
-
-			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
-				      "%s:%u to app %s on port %u\n",
-				      __func__,
-				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
-				      ntohs(cp->cport),
-				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
-				      ntohs(cp->vport),
-				      inc->name, ntohs(inc->port));
-
-			cp->app = inc;
-			if (inc->init_conn)
-				result = inc->init_conn(inc, cp);
-			goto out;
-		}
-	}
-	spin_unlock(&udp_app_lock);
-
-  out:
-	return result;
-}
-
-
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
-	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
-	[IP_VS_UDP_S_LAST]		=	2*HZ,
-};
-
-static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
-	[IP_VS_UDP_S_NORMAL]		=	"UDP",
-	[IP_VS_UDP_S_LAST]		=	"BUG!",
-};
-
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
-				       udp_state_name_table, sname, to);
-}
-
-static const char * udp_state_name(int state)
-{
-	if (state >= IP_VS_UDP_S_LAST)
-		return "ERR!";
-	return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
-}
-
-static int
-udp_state_transition(struct ip_vs_conn *cp, int direction,
-		     const struct sk_buff *skb,
-		     struct ip_vs_protocol *pp)
-{
-	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
-	return 1;
-}
-
-static void udp_init(struct ip_vs_protocol *pp)
-{
-	IP_VS_INIT_HASH_TABLE(udp_apps);
-	pp->timeout_table = udp_timeouts;
-}
-
-static void udp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_udp = {
-	.name =			"UDP",
-	.protocol =		IPPROTO_UDP,
-	.num_states =		IP_VS_UDP_S_LAST,
-	.dont_defrag =		0,
-	.init =			udp_init,
-	.exit =			udp_exit,
-	.conn_schedule =	udp_conn_schedule,
-	.conn_in_get =		udp_conn_in_get,
-	.conn_out_get =		udp_conn_out_get,
-	.snat_handler =		udp_snat_handler,
-	.dnat_handler =		udp_dnat_handler,
-	.csum_check =		udp_csum_check,
-	.state_transition =	udp_state_transition,
-	.state_name =		udp_state_name,
-	.register_app =		udp_register_app,
-	.unregister_app =	udp_unregister_app,
-	.app_conn_bind =	udp_app_conn_bind,
-	.debug_packet =		ip_vs_tcpudp_debug_packet,
-	.timeout_change =	NULL,
-	.set_state_timeout =	udp_set_state_timeout,
-};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
deleted file mode 100644
index a22195f68ac..00000000000
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * IPVS:        Round-Robin Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Fixes/Changes:
- *     Wensong Zhang            :     changed the ip_vs_rr_schedule to return dest
- *     Julian Anastasov         :     fixed the NULL pointer access bug in debugging
- *     Wensong Zhang            :     changed some comestics things for debugging
- *     Wensong Zhang            :     changed for the d-linked destination list
- *     Wensong Zhang            :     added the ip_vs_rr_update_svc
- *     Wensong Zhang            :     added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
-{
-	svc->sched_data = &svc->destinations;
-	return 0;
-}
-
-
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
-{
-	svc->sched_data = &svc->destinations;
-	return 0;
-}
-
-
-/*
- * Round-Robin Scheduling
- */
-static struct ip_vs_dest *
-ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct list_head *p, *q;
-	struct ip_vs_dest *dest;
-
-	IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
-
-	write_lock(&svc->sched_lock);
-	p = (struct list_head *)svc->sched_data;
-	p = p->next;
-	q = p;
-	do {
-		/* skip list head */
-		if (q == &svc->destinations) {
-			q = q->next;
-			continue;
-		}
-
-		dest = list_entry(q, struct ip_vs_dest, n_list);
-		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-		    atomic_read(&dest->weight) > 0)
-			/* HIT */
-			goto out;
-		q = q->next;
-	} while (q != p);
-	write_unlock(&svc->sched_lock);
-	return NULL;
-
-  out:
-	svc->sched_data = q;
-	write_unlock(&svc->sched_lock);
-	IP_VS_DBG_BUF(6, "RR: server %s:%u "
-		      "activeconns %d refcnt %d weight %d\n",
-		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
-		      atomic_read(&dest->activeconns),
-		      atomic_read(&dest->refcnt), atomic_read(&dest->weight));
-
-	return dest;
-}
-
-
-static struct ip_vs_scheduler ip_vs_rr_scheduler = {
-	.name =			"rr",			/* name */
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	1,
-#endif
-	.init_service =		ip_vs_rr_init_svc,
-	.update_service =	ip_vs_rr_update_svc,
-	.schedule =		ip_vs_rr_schedule,
-};
-
-static int __init ip_vs_rr_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
-}
-
-static void __exit ip_vs_rr_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
-}
-
-module_init(ip_vs_rr_init);
-module_exit(ip_vs_rr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
deleted file mode 100644
index a46ad9e3501..00000000000
--- a/net/ipv4/ipvs/ip_vs_sched.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the Netfilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <asm/string.h>
-#include <linux/kmod.h>
-#include <linux/sysctl.h>
-
-#include <net/ip_vs.h>
-
-/*
- *  IPVS scheduler list
- */
-static LIST_HEAD(ip_vs_schedulers);
-
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_sched_lock);
-
-
-/*
- *  Bind a service with a scheduler
- */
-int ip_vs_bind_scheduler(struct ip_vs_service *svc,
-			 struct ip_vs_scheduler *scheduler)
-{
-	int ret;
-
-	if (svc == NULL) {
-		IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
-		return -EINVAL;
-	}
-	if (scheduler == NULL) {
-		IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
-		return -EINVAL;
-	}
-
-	svc->scheduler = scheduler;
-
-	if (scheduler->init_service) {
-		ret = scheduler->init_service(svc);
-		if (ret) {
-			IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-
-/*
- *  Unbind a service with its scheduler
- */
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
-{
-	struct ip_vs_scheduler *sched;
-
-	if (svc == NULL) {
-		IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
-		return -EINVAL;
-	}
-
-	sched = svc->scheduler;
-	if (sched == NULL) {
-		IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
-		return -EINVAL;
-	}
-
-	if (sched->done_service) {
-		if (sched->done_service(svc) != 0) {
-			IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
-			return -EINVAL;
-		}
-	}
-
-	svc->scheduler = NULL;
-	return 0;
-}
-
-
-/*
- *  Get scheduler in the scheduler list by name
- */
-static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
-{
-	struct ip_vs_scheduler *sched;
-
-	IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
-		  sched_name);
-
-	read_lock_bh(&__ip_vs_sched_lock);
-
-	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
-		/*
-		 * Test and get the modules atomically
-		 */
-		if (sched->module && !try_module_get(sched->module)) {
-			/*
-			 * This scheduler is just deleted
-			 */
-			continue;
-		}
-		if (strcmp(sched_name, sched->name)==0) {
-			/* HIT */
-			read_unlock_bh(&__ip_vs_sched_lock);
-			return sched;
-		}
-		if (sched->module)
-			module_put(sched->module);
-	}
-
-	read_unlock_bh(&__ip_vs_sched_lock);
-	return NULL;
-}
-
-
-/*
- *  Lookup scheduler and try to load it if it doesn't exist
- */
-struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
-{
-	struct ip_vs_scheduler *sched;
-
-	/*
-	 *  Search for the scheduler by sched_name
-	 */
-	sched = ip_vs_sched_getbyname(sched_name);
-
-	/*
-	 *  If scheduler not found, load the module and search again
-	 */
-	if (sched == NULL) {
-		request_module("ip_vs_%s", sched_name);
-		sched = ip_vs_sched_getbyname(sched_name);
-	}
-
-	return sched;
-}
-
-void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
-{
-	if (scheduler->module)
-		module_put(scheduler->module);
-}
-
-
-/*
- *  Register a scheduler in the scheduler list
- */
-int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
-{
-	struct ip_vs_scheduler *sched;
-
-	if (!scheduler) {
-		IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
-		return -EINVAL;
-	}
-
-	if (!scheduler->name) {
-		IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
-		return -EINVAL;
-	}
-
-	/* increase the module use count */
-	ip_vs_use_count_inc();
-
-	write_lock_bh(&__ip_vs_sched_lock);
-
-	if (!list_empty(&scheduler->n_list)) {
-		write_unlock_bh(&__ip_vs_sched_lock);
-		ip_vs_use_count_dec();
-		IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
-			  "already linked\n", scheduler->name);
-		return -EINVAL;
-	}
-
-	/*
-	 *  Make sure that the scheduler with this name doesn't exist
-	 *  in the scheduler list.
-	 */
-	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
-		if (strcmp(scheduler->name, sched->name) == 0) {
-			write_unlock_bh(&__ip_vs_sched_lock);
-			ip_vs_use_count_dec();
-			IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
-					"already existed in the system\n",
-					scheduler->name);
-			return -EINVAL;
-		}
-	}
-	/*
-	 *	Add it into the d-linked scheduler list
-	 */
-	list_add(&scheduler->n_list, &ip_vs_schedulers);
-	write_unlock_bh(&__ip_vs_sched_lock);
-
-	IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
-
-	return 0;
-}
-
-
-/*
- *  Unregister a scheduler from the scheduler list
- */
-int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
-{
-	if (!scheduler) {
-		IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
-		return -EINVAL;
-	}
-
-	write_lock_bh(&__ip_vs_sched_lock);
-	if (list_empty(&scheduler->n_list)) {
-		write_unlock_bh(&__ip_vs_sched_lock);
-		IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
-			  "is not in the list. failed\n", scheduler->name);
-		return -EINVAL;
-	}
-
-	/*
-	 *	Remove it from the d-linked scheduler list
-	 */
-	list_del(&scheduler->n_list);
-	write_unlock_bh(&__ip_vs_sched_lock);
-
-	/* decrease the module use count */
-	ip_vs_use_count_dec();
-
-	IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
-
-	return 0;
-}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
deleted file mode 100644
index 7d2f22f04b8..00000000000
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * IPVS:        Shortest Expected Delay scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The SED algorithm attempts to minimize each job's expected delay until
- * completion. The expected delay that the job will experience is
- * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
- * jobs on the ith server and Ui is the fixed service rate (weight) of
- * the ith server. The SED algorithm adopts a greedy policy that each does
- * what is in its own best interest, i.e. to join the queue which would
- * minimize its expected delay of completion.
- *
- * See the following paper for more information:
- * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
- * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
- * pages 986-994, 1988.
- *
- * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
- *
- * The difference between SED and WLC is that SED includes the incoming
- * job in the cost function (the increment of 1). SED may outperform
- * WLC, while scheduling big jobs under larger heterogeneous systems
- * (the server weight varies a lot).
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
-{
-	/*
-	 * We only use the active connection number in the cost
-	 * calculation here.
-	 */
-	return atomic_read(&dest->activeconns) + 1;
-}
-
-
-/*
- *	Weighted Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest, *least;
-	unsigned int loh, doh;
-
-	IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
-
-	/*
-	 * We calculate the load of each dest server as follows:
-	 *	(server expected overhead) / dest->weight
-	 *
-	 * Remember -- no floats in kernel mode!!!
-	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
-	 *		  h1/w1 > h2/w2
-	 * if every weight is larger than zero.
-	 *
-	 * The server with weight=0 is quiesced and will not receive any
-	 * new connections.
-	 */
-
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-		    atomic_read(&dest->weight) > 0) {
-			least = dest;
-			loh = ip_vs_sed_dest_overhead(least);
-			goto nextstage;
-		}
-	}
-	return NULL;
-
-	/*
-	 *    Find the destination with the least load.
-	 */
-  nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-		doh = ip_vs_sed_dest_overhead(dest);
-		if (loh * atomic_read(&dest->weight) >
-		    doh * atomic_read(&least->weight)) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	IP_VS_DBG_BUF(6, "SED: server %s:%u "
-		      "activeconns %d refcnt %d weight %d overhead %d\n",
-		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-		      atomic_read(&least->activeconns),
-		      atomic_read(&least->refcnt),
-		      atomic_read(&least->weight), loh);
-
-	return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_sed_scheduler =
-{
-	.name =			"sed",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	1,
-#endif
-	.schedule =		ip_vs_sed_schedule,
-};
-
-
-static int __init ip_vs_sed_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
-}
-
-static void __exit ip_vs_sed_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
-}
-
-module_init(ip_vs_sed_init);
-module_exit(ip_vs_sed_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
deleted file mode 100644
index 1d96de27fef..00000000000
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * IPVS:        Source Hashing scheduling module
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The sh algorithm is to select server by the hash key of source IP
- * address. The pseudo code is as follows:
- *
- *       n <- servernode[src_ip];
- *       if (n is dead) OR
- *          (n is overloaded) or (n.weight <= 0) then
- *                 return NULL;
- *
- *       return n;
- *
- * Notes that servernode is a 256-bucket hash table that maps the hash
- * index derived from packet source IP address to the current server
- * array. If the sh scheduler is used in cache cluster, it is good to
- * combine it with cache_bypass feature. When the statically assigned
- * server is dead or overloaded, the load balancer can bypass the cache
- * server and send requests to the original server directly.
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *      IPVS SH bucket
- */
-struct ip_vs_sh_bucket {
-	struct ip_vs_dest       *dest;          /* real server (cache) */
-};
-
-/*
- *     for IPVS SH entry hash table
- */
-#ifndef CONFIG_IP_VS_SH_TAB_BITS
-#define CONFIG_IP_VS_SH_TAB_BITS        8
-#endif
-#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
-#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
-#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
-
-
-/*
- *	Returns hash value for IPVS SH entry
- */
-static inline unsigned ip_vs_sh_hashkey(__be32 addr)
-{
-	return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
-}
-
-
-/*
- *      Get ip_vs_dest associated with supplied parameters.
- */
-static inline struct ip_vs_dest *
-ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
-{
-	return (tbl[ip_vs_sh_hashkey(addr)]).dest;
-}
-
-
-/*
- *      Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
-{
-	int i;
-	struct ip_vs_sh_bucket *b;
-	struct list_head *p;
-	struct ip_vs_dest *dest;
-
-	b = tbl;
-	p = &svc->destinations;
-	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
-		if (list_empty(p)) {
-			b->dest = NULL;
-		} else {
-			if (p == &svc->destinations)
-				p = p->next;
-
-			dest = list_entry(p, struct ip_vs_dest, n_list);
-			atomic_inc(&dest->refcnt);
-			b->dest = dest;
-
-			p = p->next;
-		}
-		b++;
-	}
-	return 0;
-}
-
-
-/*
- *      Flush all the hash buckets of the specified table.
- */
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
-{
-	int i;
-	struct ip_vs_sh_bucket *b;
-
-	b = tbl;
-	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
-		if (b->dest) {
-			atomic_dec(&b->dest->refcnt);
-			b->dest = NULL;
-		}
-		b++;
-	}
-}
-
-
-static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_sh_bucket *tbl;
-
-	/* allocate the SH table for this service */
-	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
-		      GFP_ATOMIC);
-	if (tbl == NULL) {
-		IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
-		return -ENOMEM;
-	}
-	svc->sched_data = tbl;
-	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
-		  "current service\n",
-		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
-	/* assign the hash buckets with the updated service */
-	ip_vs_sh_assign(tbl, svc);
-
-	return 0;
-}
-
-
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
-	/* got to clean up hash buckets here */
-	ip_vs_sh_flush(tbl);
-
-	/* release the table itself */
-	kfree(svc->sched_data);
-	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
-		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
-	return 0;
-}
-
-
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
-	/* got to clean up hash buckets here */
-	ip_vs_sh_flush(tbl);
-
-	/* assign the hash buckets with the updated service */
-	ip_vs_sh_assign(tbl, svc);
-
-	return 0;
-}
-
-
-/*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- *      Source Hashing scheduling
- */
-static struct ip_vs_dest *
-ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest;
-	struct ip_vs_sh_bucket *tbl;
-	struct iphdr *iph = ip_hdr(skb);
-
-	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
-
-	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
-	dest = ip_vs_sh_get(tbl, iph->saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
-		return NULL;
-	}
-
-	IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
-		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(iph->saddr),
-		  NIPQUAD(dest->addr.ip),
-		  ntohs(dest->port));
-
-	return dest;
-}
-
-
-/*
- *      IPVS SH Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_sh_scheduler =
-{
-	.name =			"sh",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list	 =		LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	0,
-#endif
-	.init_service =		ip_vs_sh_init_svc,
-	.done_service =		ip_vs_sh_done_svc,
-	.update_service =	ip_vs_sh_update_svc,
-	.schedule =		ip_vs_sh_schedule,
-};
-
-
-static int __init ip_vs_sh_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
-}
-
-
-static void __exit ip_vs_sh_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
-}
-
-
-module_init(ip_vs_sh_init);
-module_exit(ip_vs_sh_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
deleted file mode 100644
index de5e7e118ee..00000000000
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ /dev/null
@@ -1,942 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the NetFilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * ip_vs_sync:  sync connection info from master load balancer to backups
- *              through multicast
- *
- * Changes:
- *	Alexandre Cassen	:	Added master & backup support at a time.
- *	Alexandre Cassen	:	Added SyncID support for incoming sync
- *					messages filtering.
- *	Justin Ossevoort	:	Fix endian problem on sync message size.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/inetdevice.h>
-#include <linux/net.h>
-#include <linux/completion.h>
-#include <linux/delay.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/igmp.h>                 /* for ip_mc_join_group */
-#include <linux/udp.h>
-#include <linux/err.h>
-#include <linux/kthread.h>
-#include <linux/wait.h>
-#include <linux/kernel.h>
-
-#include <net/ip.h>
-#include <net/sock.h>
-
-#include <net/ip_vs.h>
-
-#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
-#define IP_VS_SYNC_PORT  8848          /* multicast port */
-
-
-/*
- *	IPVS sync connection entry
- */
-struct ip_vs_sync_conn {
-	__u8			reserved;
-
-	/* Protocol, addresses and port numbers */
-	__u8			protocol;       /* Which protocol (TCP/UDP) */
-	__be16			cport;
-	__be16                  vport;
-	__be16                  dport;
-	__be32                  caddr;          /* client address */
-	__be32                  vaddr;          /* virtual address */
-	__be32                  daddr;          /* destination address */
-
-	/* Flags and state transition */
-	__be16                  flags;          /* status flags */
-	__be16                  state;          /* state info */
-
-	/* The sequence options start here */
-};
-
-struct ip_vs_sync_conn_options {
-	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
-	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
-};
-
-struct ip_vs_sync_thread_data {
-	struct socket *sock;
-	char *buf;
-};
-
-#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
-#define FULL_CONN_SIZE  \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
-
-
-/*
-  The master mulitcasts messages to the backup load balancers in the
-  following format.
-
-       0                   1                   2                   3
-       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |  Count Conns  |    SyncID     |            Size               |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |                                                               |
-      |                    IPVS Sync Connection (1)                   |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |                            .                                  |
-      |                            .                                  |
-      |                            .                                  |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |                                                               |
-      |                    IPVS Sync Connection (n)                   |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*/
-
-#define SYNC_MESG_HEADER_LEN	4
-#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-
-struct ip_vs_sync_mesg {
-	__u8                    nr_conns;
-	__u8                    syncid;
-	__u16                   size;
-
-	/* ip_vs_sync_conn entries start here */
-};
-
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
-
-struct ip_vs_sync_buff {
-	struct list_head        list;
-	unsigned long           firstuse;
-
-	/* pointers for the message data */
-	struct ip_vs_sync_mesg  *mesg;
-	unsigned char           *head;
-	unsigned char           *end;
-};
-
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff   *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
-	.sin_family		= AF_INET,
-	.sin_port		= __constant_htons(IP_VS_SYNC_PORT),
-	.sin_addr.s_addr	= __constant_htonl(IP_VS_SYNC_GROUP),
-};
-
-
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
-{
-	struct ip_vs_sync_buff *sb;
-
-	spin_lock_bh(&ip_vs_sync_lock);
-	if (list_empty(&ip_vs_sync_queue)) {
-		sb = NULL;
-	} else {
-		sb = list_entry(ip_vs_sync_queue.next,
-				struct ip_vs_sync_buff,
-				list);
-		list_del(&sb->list);
-	}
-	spin_unlock_bh(&ip_vs_sync_lock);
-
-	return sb;
-}
-
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
-{
-	struct ip_vs_sync_buff *sb;
-
-	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
-		return NULL;
-
-	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
-		kfree(sb);
-		return NULL;
-	}
-	sb->mesg->nr_conns = 0;
-	sb->mesg->syncid = ip_vs_master_syncid;
-	sb->mesg->size = 4;
-	sb->head = (unsigned char *)sb->mesg + 4;
-	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
-	sb->firstuse = jiffies;
-	return sb;
-}
-
-static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
-{
-	kfree(sb->mesg);
-	kfree(sb);
-}
-
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
-{
-	spin_lock(&ip_vs_sync_lock);
-	if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-		list_add_tail(&sb->list, &ip_vs_sync_queue);
-	else
-		ip_vs_sync_buff_release(sb);
-	spin_unlock(&ip_vs_sync_lock);
-}
-
-/*
- *	Get the current sync buffer if it has been created for more
- *	than the specified time or the specified time is zero.
- */
-static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
-{
-	struct ip_vs_sync_buff *sb;
-
-	spin_lock_bh(&curr_sb_lock);
-	if (curr_sb && (time == 0 ||
-			time_before(jiffies - curr_sb->firstuse, time))) {
-		sb = curr_sb;
-		curr_sb = NULL;
-	} else
-		sb = NULL;
-	spin_unlock_bh(&curr_sb_lock);
-	return sb;
-}
-
-
-/*
- *      Add an ip_vs_conn information into the current sync_buff.
- *      Called by ip_vs_in.
- */
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
-{
-	struct ip_vs_sync_mesg *m;
-	struct ip_vs_sync_conn *s;
-	int len;
-
-	spin_lock(&curr_sb_lock);
-	if (!curr_sb) {
-		if (!(curr_sb=ip_vs_sync_buff_create())) {
-			spin_unlock(&curr_sb_lock);
-			IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
-			return;
-		}
-	}
-
-	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
-		SIMPLE_CONN_SIZE;
-	m = curr_sb->mesg;
-	s = (struct ip_vs_sync_conn *)curr_sb->head;
-
-	/* copy members */
-	s->protocol = cp->protocol;
-	s->cport = cp->cport;
-	s->vport = cp->vport;
-	s->dport = cp->dport;
-	s->caddr = cp->caddr.ip;
-	s->vaddr = cp->vaddr.ip;
-	s->daddr = cp->daddr.ip;
-	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
-	s->state = htons(cp->state);
-	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
-		struct ip_vs_sync_conn_options *opt =
-			(struct ip_vs_sync_conn_options *)&s[1];
-		memcpy(opt, &cp->in_seq, sizeof(*opt));
-	}
-
-	m->nr_conns++;
-	m->size += len;
-	curr_sb->head += len;
-
-	/* check if there is a space for next one */
-	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
-		sb_queue_tail(curr_sb);
-		curr_sb = NULL;
-	}
-	spin_unlock(&curr_sb_lock);
-
-	/* synchronize its controller if it has */
-	if (cp->control)
-		ip_vs_sync_conn(cp->control);
-}
-
-
-/*
- *      Process received multicast message and create the corresponding
- *      ip_vs_conn entries.
- */
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
-{
-	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
-	struct ip_vs_sync_conn *s;
-	struct ip_vs_sync_conn_options *opt;
-	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp;
-	struct ip_vs_dest *dest;
-	char *p;
-	int i;
-
-	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
-		IP_VS_ERR_RL("sync message header too short\n");
-		return;
-	}
-
-	/* Convert size back to host byte order */
-	m->size = ntohs(m->size);
-
-	if (buflen != m->size) {
-		IP_VS_ERR_RL("bogus sync message size\n");
-		return;
-	}
-
-	/* SyncID sanity check */
-	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
-		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
-			  m->syncid);
-		return;
-	}
-
-	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
-	for (i=0; i<m->nr_conns; i++) {
-		unsigned flags, state;
-
-		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
-			IP_VS_ERR_RL("bogus conn in sync message\n");
-			return;
-		}
-		s = (struct ip_vs_sync_conn *) p;
-		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
-		flags &= ~IP_VS_CONN_F_HASHED;
-		if (flags & IP_VS_CONN_F_SEQ_MASK) {
-			opt = (struct ip_vs_sync_conn_options *)&s[1];
-			p += FULL_CONN_SIZE;
-			if (p > buffer+buflen) {
-				IP_VS_ERR_RL("bogus conn options in sync message\n");
-				return;
-			}
-		} else {
-			opt = NULL;
-			p += SIMPLE_CONN_SIZE;
-		}
-
-		state = ntohs(s->state);
-		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
-			pp = ip_vs_proto_get(s->protocol);
-			if (!pp) {
-				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
-					s->protocol);
-				continue;
-			}
-			if (state >= pp->num_states) {
-				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
-					pp->name, state);
-				continue;
-			}
-		} else {
-			/* protocol in templates is not used for state/timeout */
-			pp = NULL;
-			if (state > 0) {
-				IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
-					state);
-				state = 0;
-			}
-		}
-
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport);
-		else
-			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-					     (union nf_inet_addr *)&s->caddr,
-					     s->cport,
-					     (union nf_inet_addr *)&s->vaddr,
-					     s->vport);
-		if (!cp) {
-			/*
-			 * Find the appropriate destination for the connection.
-			 * If it is not found the connection will remain unbound
-			 * but still handled.
-			 */
-			dest = ip_vs_find_dest(AF_INET,
-					       (union nf_inet_addr *)&s->daddr,
-					       s->dport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport,
-					       s->protocol);
-			/*  Set the approprite ativity flag */
-			if (s->protocol == IPPROTO_TCP) {
-				if (state != IP_VS_TCP_S_ESTABLISHED)
-					flags |= IP_VS_CONN_F_INACTIVE;
-				else
-					flags &= ~IP_VS_CONN_F_INACTIVE;
-			}
-			cp = ip_vs_conn_new(AF_INET, s->protocol,
-					    (union nf_inet_addr *)&s->caddr,
-					    s->cport,
-					    (union nf_inet_addr *)&s->vaddr,
-					    s->vport,
-					    (union nf_inet_addr *)&s->daddr,
-					    s->dport,
-					    flags, dest);
-			if (dest)
-				atomic_dec(&dest->refcnt);
-			if (!cp) {
-				IP_VS_ERR("ip_vs_conn_new failed\n");
-				return;
-			}
-		} else if (!cp->dest) {
-			dest = ip_vs_try_bind_dest(cp);
-			if (dest)
-				atomic_dec(&dest->refcnt);
-		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
-			   (cp->state != state)) {
-			/* update active/inactive flag for the connection */
-			dest = cp->dest;
-			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				(state != IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_dec(&dest->activeconns);
-				atomic_inc(&dest->inactconns);
-				cp->flags |= IP_VS_CONN_F_INACTIVE;
-			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				(state == IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_inc(&dest->activeconns);
-				atomic_dec(&dest->inactconns);
-				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
-			}
-		}
-
-		if (opt)
-			memcpy(&cp->in_seq, opt, sizeof(*opt));
-		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
-		cp->state = state;
-		cp->old_state = cp->state;
-		/*
-		 * We can not recover the right timeout for templates
-		 * in all cases, we can not find the right fwmark
-		 * virtual service. If needed, we can do it for
-		 * non-fwmark persistent services.
-		 */
-		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
-			cp->timeout = pp->timeout_table[state];
-		else
-			cp->timeout = (3*60*HZ);
-		ip_vs_conn_put(cp);
-	}
-}
-
-
-/*
- *      Setup loopback of outgoing multicasts on a sending socket
- */
-static void set_mcast_loop(struct sock *sk, u_char loop)
-{
-	struct inet_sock *inet = inet_sk(sk);
-
-	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
-	lock_sock(sk);
-	inet->mc_loop = loop ? 1 : 0;
-	release_sock(sk);
-}
-
-/*
- *      Specify TTL for outgoing multicasts on a sending socket
- */
-static void set_mcast_ttl(struct sock *sk, u_char ttl)
-{
-	struct inet_sock *inet = inet_sk(sk);
-
-	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
-	lock_sock(sk);
-	inet->mc_ttl = ttl;
-	release_sock(sk);
-}
-
-/*
- *      Specifiy default interface for outgoing multicasts
- */
-static int set_mcast_if(struct sock *sk, char *ifname)
-{
-	struct net_device *dev;
-	struct inet_sock *inet = inet_sk(sk);
-
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
-		return -ENODEV;
-
-	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
-		return -EINVAL;
-
-	lock_sock(sk);
-	inet->mc_index = dev->ifindex;
-	/*  inet->mc_addr  = 0; */
-	release_sock(sk);
-
-	return 0;
-}
-
-
-/*
- *	Set the maximum length of sync message according to the
- *	specified interface's MTU.
- */
-static int set_sync_mesg_maxlen(int sync_state)
-{
-	struct net_device *dev;
-	int num;
-
-	if (sync_state == IP_VS_STATE_MASTER) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
-			return -ENODEV;
-
-		num = (dev->mtu - sizeof(struct iphdr) -
-		       sizeof(struct udphdr) -
-		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
-			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
-		IP_VS_DBG(7, "setting the maximum length of sync sending "
-			  "message %d.\n", sync_send_mesg_maxlen);
-	} else if (sync_state == IP_VS_STATE_BACKUP) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
-			return -ENODEV;
-
-		sync_recv_mesg_maxlen = dev->mtu -
-			sizeof(struct iphdr) - sizeof(struct udphdr);
-		IP_VS_DBG(7, "setting the maximum length of sync receiving "
-			  "message %d.\n", sync_recv_mesg_maxlen);
-	}
-
-	return 0;
-}
-
-
-/*
- *      Join a multicast group.
- *      the group is specified by a class D multicast address 224.0.0.0/8
- *      in the in_addr structure passed in as a parameter.
- */
-static int
-join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
-{
-	struct ip_mreqn mreq;
-	struct net_device *dev;
-	int ret;
-
-	memset(&mreq, 0, sizeof(mreq));
-	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
-
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
-		return -ENODEV;
-	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
-		return -EINVAL;
-
-	mreq.imr_ifindex = dev->ifindex;
-
-	lock_sock(sk);
-	ret = ip_mc_join_group(sk, &mreq);
-	release_sock(sk);
-
-	return ret;
-}
-
-
-static int bind_mcastif_addr(struct socket *sock, char *ifname)
-{
-	struct net_device *dev;
-	__be32 addr;
-	struct sockaddr_in sin;
-
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
-		return -ENODEV;
-
-	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
-	if (!addr)
-		IP_VS_ERR("You probably need to specify IP address on "
-			  "multicast interface.\n");
-
-	IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
-		  ifname, NIPQUAD(addr));
-
-	/* Now bind the socket with the address of multicast interface */
-	sin.sin_family	     = AF_INET;
-	sin.sin_addr.s_addr  = addr;
-	sin.sin_port         = 0;
-
-	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
-}
-
-/*
- *      Set up sending multicast socket over UDP
- */
-static struct socket * make_send_sock(void)
-{
-	struct socket *sock;
-	int result;
-
-	/* First create a socket */
-	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
-	if (result < 0) {
-		IP_VS_ERR("Error during creation of socket; terminating\n");
-		return ERR_PTR(result);
-	}
-
-	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
-	if (result < 0) {
-		IP_VS_ERR("Error setting outbound mcast interface\n");
-		goto error;
-	}
-
-	set_mcast_loop(sock->sk, 0);
-	set_mcast_ttl(sock->sk, 1);
-
-	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
-	if (result < 0) {
-		IP_VS_ERR("Error binding address of the mcast interface\n");
-		goto error;
-	}
-
-	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
-			sizeof(struct sockaddr), 0);
-	if (result < 0) {
-		IP_VS_ERR("Error connecting to the multicast addr\n");
-		goto error;
-	}
-
-	return sock;
-
-  error:
-	sock_release(sock);
-	return ERR_PTR(result);
-}
-
-
-/*
- *      Set up receiving multicast socket over UDP
- */
-static struct socket * make_receive_sock(void)
-{
-	struct socket *sock;
-	int result;
-
-	/* First create a socket */
-	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
-	if (result < 0) {
-		IP_VS_ERR("Error during creation of socket; terminating\n");
-		return ERR_PTR(result);
-	}
-
-	/* it is equivalent to the REUSEADDR option in user-space */
-	sock->sk->sk_reuse = 1;
-
-	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
-			sizeof(struct sockaddr));
-	if (result < 0) {
-		IP_VS_ERR("Error binding to the multicast addr\n");
-		goto error;
-	}
-
-	/* join the multicast group */
-	result = join_mcast_group(sock->sk,
-			(struct in_addr *) &mcast_addr.sin_addr,
-			ip_vs_backup_mcast_ifn);
-	if (result < 0) {
-		IP_VS_ERR("Error joining to the multicast group\n");
-		goto error;
-	}
-
-	return sock;
-
-  error:
-	sock_release(sock);
-	return ERR_PTR(result);
-}
-
-
-static int
-ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
-{
-	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
-	struct kvec	iov;
-	int		len;
-
-	EnterFunction(7);
-	iov.iov_base     = (void *)buffer;
-	iov.iov_len      = length;
-
-	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
-
-	LeaveFunction(7);
-	return len;
-}
-
-static void
-ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
-{
-	int msize;
-
-	msize = msg->size;
-
-	/* Put size in network byte order */
-	msg->size = htons(msg->size);
-
-	if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
-		IP_VS_ERR("ip_vs_send_async error\n");
-}
-
-static int
-ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
-{
-	struct msghdr		msg = {NULL,};
-	struct kvec		iov;
-	int			len;
-
-	EnterFunction(7);
-
-	/* Receive a packet */
-	iov.iov_base     = buffer;
-	iov.iov_len      = (size_t)buflen;
-
-	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
-
-	if (len < 0)
-		return -1;
-
-	LeaveFunction(7);
-	return len;
-}
-
-
-static int sync_thread_master(void *data)
-{
-	struct ip_vs_sync_thread_data *tinfo = data;
-	struct ip_vs_sync_buff *sb;
-
-	IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
-		   "syncid = %d\n",
-		   ip_vs_master_mcast_ifn, ip_vs_master_syncid);
-
-	while (!kthread_should_stop()) {
-		while ((sb = sb_dequeue())) {
-			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
-			ip_vs_sync_buff_release(sb);
-		}
-
-		/* check if entries stay in curr_sb for 2 seconds */
-		sb = get_curr_sync_buff(2 * HZ);
-		if (sb) {
-			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
-			ip_vs_sync_buff_release(sb);
-		}
-
-		schedule_timeout_interruptible(HZ);
-	}
-
-	/* clean up the sync_buff queue */
-	while ((sb=sb_dequeue())) {
-		ip_vs_sync_buff_release(sb);
-	}
-
-	/* clean up the current sync_buff */
-	if ((sb = get_curr_sync_buff(0))) {
-		ip_vs_sync_buff_release(sb);
-	}
-
-	/* release the sending multicast socket */
-	sock_release(tinfo->sock);
-	kfree(tinfo);
-
-	return 0;
-}
-
-
-static int sync_thread_backup(void *data)
-{
-	struct ip_vs_sync_thread_data *tinfo = data;
-	int len;
-
-	IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
-		   "syncid = %d\n",
-		   ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
-
-	while (!kthread_should_stop()) {
-		wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
-			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
-			 || kthread_should_stop());
-
-		/* do we have data now? */
-		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
-			len = ip_vs_receive(tinfo->sock, tinfo->buf,
-					sync_recv_mesg_maxlen);
-			if (len <= 0) {
-				IP_VS_ERR("receiving message error\n");
-				break;
-			}
-
-			/* disable bottom half, because it accesses the data
-			   shared by softirq while getting/creating conns */
-			local_bh_disable();
-			ip_vs_process_message(tinfo->buf, len);
-			local_bh_enable();
-		}
-	}
-
-	/* release the sending multicast socket */
-	sock_release(tinfo->sock);
-	kfree(tinfo->buf);
-	kfree(tinfo);
-
-	return 0;
-}
-
-
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
-{
-	struct ip_vs_sync_thread_data *tinfo;
-	struct task_struct **realtask, *task;
-	struct socket *sock;
-	char *name, *buf = NULL;
-	int (*threadfn)(void *data);
-	int result = -ENOMEM;
-
-	IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
-	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
-		  sizeof(struct ip_vs_sync_conn));
-
-	if (state == IP_VS_STATE_MASTER) {
-		if (sync_master_thread)
-			return -EEXIST;
-
-		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_master_mcast_ifn));
-		ip_vs_master_syncid = syncid;
-		realtask = &sync_master_thread;
-		name = "ipvs_syncmaster";
-		threadfn = sync_thread_master;
-		sock = make_send_sock();
-	} else if (state == IP_VS_STATE_BACKUP) {
-		if (sync_backup_thread)
-			return -EEXIST;
-
-		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_backup_mcast_ifn));
-		ip_vs_backup_syncid = syncid;
-		realtask = &sync_backup_thread;
-		name = "ipvs_syncbackup";
-		threadfn = sync_thread_backup;
-		sock = make_receive_sock();
-	} else {
-		return -EINVAL;
-	}
-
-	if (IS_ERR(sock)) {
-		result = PTR_ERR(sock);
-		goto out;
-	}
-
-	set_sync_mesg_maxlen(state);
-	if (state == IP_VS_STATE_BACKUP) {
-		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
-		if (!buf)
-			goto outsocket;
-	}
-
-	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
-	if (!tinfo)
-		goto outbuf;
-
-	tinfo->sock = sock;
-	tinfo->buf = buf;
-
-	task = kthread_run(threadfn, tinfo, name);
-	if (IS_ERR(task)) {
-		result = PTR_ERR(task);
-		goto outtinfo;
-	}
-
-	/* mark as active */
-	*realtask = task;
-	ip_vs_sync_state |= state;
-
-	/* increase the module use count */
-	ip_vs_use_count_inc();
-
-	return 0;
-
-outtinfo:
-	kfree(tinfo);
-outbuf:
-	kfree(buf);
-outsocket:
-	sock_release(sock);
-out:
-	return result;
-}
-
-
-int stop_sync_thread(int state)
-{
-	IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
-
-	if (state == IP_VS_STATE_MASTER) {
-		if (!sync_master_thread)
-			return -ESRCH;
-
-		IP_VS_INFO("stopping master sync thread %d ...\n",
-			   task_pid_nr(sync_master_thread));
-
-		/*
-		 * The lock synchronizes with sb_queue_tail(), so that we don't
-		 * add sync buffers to the queue, when we are already in
-		 * progress of stopping the master sync daemon.
-		 */
-
-		spin_lock_bh(&ip_vs_sync_lock);
-		ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
-		spin_unlock_bh(&ip_vs_sync_lock);
-		kthread_stop(sync_master_thread);
-		sync_master_thread = NULL;
-	} else if (state == IP_VS_STATE_BACKUP) {
-		if (!sync_backup_thread)
-			return -ESRCH;
-
-		IP_VS_INFO("stopping backup sync thread %d ...\n",
-			   task_pid_nr(sync_backup_thread));
-
-		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
-		kthread_stop(sync_backup_thread);
-		sync_backup_thread = NULL;
-	} else {
-		return -EINVAL;
-	}
-
-	/* decrease the module use count */
-	ip_vs_use_count_dec();
-
-	return 0;
-}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
deleted file mode 100644
index 8c596e71259..00000000000
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * IPVS:        Weighted Least-Connection Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
- *     Wensong Zhang            :     changed to use the inactconns in scheduling
- *     Wensong Zhang            :     changed some comestics things for debugging
- *     Wensong Zhang            :     changed for the d-linked destination list
- *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
- *     Wensong Zhang            :     added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
-{
-	/*
-	 * We think the overhead of processing active connections is 256
-	 * times higher than that of inactive connections in average. (This
-	 * 256 times might not be accurate, we will change it later) We
-	 * use the following formula to estimate the overhead now:
-	 *		  dest->activeconns*256 + dest->inactconns
-	 */
-	return (atomic_read(&dest->activeconns) << 8) +
-		atomic_read(&dest->inactconns);
-}
-
-
-/*
- *	Weighted Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest, *least;
-	unsigned int loh, doh;
-
-	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
-
-	/*
-	 * We calculate the load of each dest server as follows:
-	 *		  (dest overhead) / dest->weight
-	 *
-	 * Remember -- no floats in kernel mode!!!
-	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
-	 *		  h1/w1 > h2/w2
-	 * if every weight is larger than zero.
-	 *
-	 * The server with weight=0 is quiesced and will not receive any
-	 * new connections.
-	 */
-
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-		    atomic_read(&dest->weight) > 0) {
-			least = dest;
-			loh = ip_vs_wlc_dest_overhead(least);
-			goto nextstage;
-		}
-	}
-	return NULL;
-
-	/*
-	 *    Find the destination with the least load.
-	 */
-  nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-			continue;
-		doh = ip_vs_wlc_dest_overhead(dest);
-		if (loh * atomic_read(&dest->weight) >
-		    doh * atomic_read(&least->weight)) {
-			least = dest;
-			loh = doh;
-		}
-	}
-
-	IP_VS_DBG_BUF(6, "WLC: server %s:%u "
-		      "activeconns %d refcnt %d weight %d overhead %d\n",
-		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-		      atomic_read(&least->activeconns),
-		      atomic_read(&least->refcnt),
-		      atomic_read(&least->weight), loh);
-
-	return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_wlc_scheduler =
-{
-	.name =			"wlc",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	1,
-#endif
-	.schedule =		ip_vs_wlc_schedule,
-};
-
-
-static int __init ip_vs_wlc_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
-}
-
-static void __exit ip_vs_wlc_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
-}
-
-module_init(ip_vs_wlc_init);
-module_exit(ip_vs_wlc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
deleted file mode 100644
index 7ea92fed50b..00000000000
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * IPVS:        Weighted Round-Robin Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
- *     Wensong Zhang            :     changed some comestics things for debugging
- *     Wensong Zhang            :     changed for the d-linked destination list
- *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
- *     Julian Anastasov         :     fixed the bug of returning destination
- *                                    with weight 0 when all weights are zero
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/net.h>
-
-#include <net/ip_vs.h>
-
-/*
- * current destination pointer for weighted round-robin scheduling
- */
-struct ip_vs_wrr_mark {
-	struct list_head *cl;	/* current list head */
-	int cw;			/* current weight */
-	int mw;			/* maximum weight */
-	int di;			/* decreasing interval */
-};
-
-
-/*
- *    Get the gcd of server weights
- */
-static int gcd(int a, int b)
-{
-	int c;
-
-	while ((c = a % b)) {
-		a = b;
-		b = c;
-	}
-	return b;
-}
-
-static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
-{
-	struct ip_vs_dest *dest;
-	int weight;
-	int g = 0;
-
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		weight = atomic_read(&dest->weight);
-		if (weight > 0) {
-			if (g > 0)
-				g = gcd(weight, g);
-			else
-				g = weight;
-		}
-	}
-	return g ? g : 1;
-}
-
-
-/*
- *    Get the maximum weight of the service destinations.
- */
-static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
-{
-	struct ip_vs_dest *dest;
-	int weight = 0;
-
-	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (atomic_read(&dest->weight) > weight)
-			weight = atomic_read(&dest->weight);
-	}
-
-	return weight;
-}
-
-
-static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_wrr_mark *mark;
-
-	/*
-	 *    Allocate the mark variable for WRR scheduling
-	 */
-	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
-	if (mark == NULL) {
-		IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
-		return -ENOMEM;
-	}
-	mark->cl = &svc->destinations;
-	mark->cw = 0;
-	mark->mw = ip_vs_wrr_max_weight(svc);
-	mark->di = ip_vs_wrr_gcd_weight(svc);
-	svc->sched_data = mark;
-
-	return 0;
-}
-
-
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
-{
-	/*
-	 *    Release the mark variable
-	 */
-	kfree(svc->sched_data);
-
-	return 0;
-}
-
-
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_wrr_mark *mark = svc->sched_data;
-
-	mark->cl = &svc->destinations;
-	mark->mw = ip_vs_wrr_max_weight(svc);
-	mark->di = ip_vs_wrr_gcd_weight(svc);
-	if (mark->cw > mark->mw)
-		mark->cw = 0;
-	return 0;
-}
-
-
-/*
- *    Weighted Round-Robin Scheduling
- */
-static struct ip_vs_dest *
-ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest;
-	struct ip_vs_wrr_mark *mark = svc->sched_data;
-	struct list_head *p;
-
-	IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
-
-	/*
-	 * This loop will always terminate, because mark->cw in (0, max_weight]
-	 * and at least one server has its weight equal to max_weight.
-	 */
-	write_lock(&svc->sched_lock);
-	p = mark->cl;
-	while (1) {
-		if (mark->cl == &svc->destinations) {
-			/* it is at the head of the destination list */
-
-			if (mark->cl == mark->cl->next) {
-				/* no dest entry */
-				dest = NULL;
-				goto out;
-			}
-
-			mark->cl = svc->destinations.next;
-			mark->cw -= mark->di;
-			if (mark->cw <= 0) {
-				mark->cw = mark->mw;
-				/*
-				 * Still zero, which means no available servers.
-				 */
-				if (mark->cw == 0) {
-					mark->cl = &svc->destinations;
-					IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
-						   "no available servers\n");
-					dest = NULL;
-					goto out;
-				}
-			}
-		} else
-			mark->cl = mark->cl->next;
-
-		if (mark->cl != &svc->destinations) {
-			/* not at the head of the list */
-			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
-			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-			    atomic_read(&dest->weight) >= mark->cw) {
-				/* got it */
-				break;
-			}
-		}
-
-		if (mark->cl == p && mark->cw == mark->di) {
-			/* back to the start, and no dest is found.
-			   It is only possible when all dests are OVERLOADED */
-			dest = NULL;
-			goto out;
-		}
-	}
-
-	IP_VS_DBG_BUF(6, "WRR: server %s:%u "
-		      "activeconns %d refcnt %d weight %d\n",
-		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
-		      atomic_read(&dest->activeconns),
-		      atomic_read(&dest->refcnt),
-		      atomic_read(&dest->weight));
-
-  out:
-	write_unlock(&svc->sched_lock);
-	return dest;
-}
-
-
-static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
-	.name =			"wrr",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	1,
-#endif
-	.init_service =		ip_vs_wrr_init_svc,
-	.done_service =		ip_vs_wrr_done_svc,
-	.update_service =	ip_vs_wrr_update_svc,
-	.schedule =		ip_vs_wrr_schedule,
-};
-
-static int __init ip_vs_wrr_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
-}
-
-static void __exit ip_vs_wrr_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
-}
-
-module_init(ip_vs_wrr_init);
-module_exit(ip_vs_wrr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
deleted file mode 100644
index 02ddc2b3ce2..00000000000
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ /dev/null
@@ -1,1004 +0,0 @@
-/*
- * ip_vs_xmit.c: various packet transmitters for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/tcp.h>                  /* for tcphdr */
-#include <net/ip.h>
-#include <net/tcp.h>                    /* for csum_tcpudp_magic */
-#include <net/udp.h>
-#include <net/icmp.h>                   /* for icmp_send */
-#include <net/route.h>                  /* for ip_route_output */
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <linux/icmpv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *      Destination cache to speed up outgoing route lookup
- */
-static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
-{
-	struct dst_entry *old_dst;
-
-	old_dst = dest->dst_cache;
-	dest->dst_cache = dst;
-	dest->dst_rtos = rtos;
-	dst_release(old_dst);
-}
-
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
-{
-	struct dst_entry *dst = dest->dst_cache;
-
-	if (!dst)
-		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
-		dest->dst_cache = NULL;
-		dst_release(dst);
-		return NULL;
-	}
-	dst_hold(dst);
-	return dst;
-}
-
-static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
-{
-	struct rtable *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
-
-	if (dest) {
-		spin_lock(&dest->dst_lock);
-		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip4_u = {
-						.daddr = dest->addr.ip,
-						.saddr = 0,
-						.tos = rtos, } },
-			};
-
-			if (ip_route_output_key(&init_net, &rt, &fl)) {
-				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip_route_output error, "
-					     "dest: %u.%u.%u.%u\n",
-					     NIPQUAD(dest->addr.ip));
-				return NULL;
-			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
-			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
-				  NIPQUAD(dest->addr.ip),
-				  atomic_read(&rt->u.dst.__refcnt), rtos);
-		}
-		spin_unlock(&dest->dst_lock);
-	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip4_u = {
-					.daddr = cp->daddr.ip,
-					.saddr = 0,
-					.tos = rtos, } },
-		};
-
-		if (ip_route_output_key(&init_net, &rt, &fl)) {
-			IP_VS_DBG_RL("ip_route_output error, dest: "
-				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
-			return NULL;
-		}
-	}
-
-	return rt;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
-{
-	struct rt6_info *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
-
-	if (dest) {
-		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
-		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
-
-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
-				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, "
-					     "dest: " NIP6_FMT "\n",
-					     NIP6(dest->addr.in6));
-				return NULL;
-			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
-			IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
-				  NIP6(dest->addr.in6),
-				  atomic_read(&rt->u.dst.__refcnt));
-		}
-		spin_unlock(&dest->dst_lock);
-	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: "
-				     NIP6_FMT "\n", NIP6(cp->daddr.in6));
-			return NULL;
-		}
-	}
-
-	return rt;
-}
-#endif
-
-
-/*
- *	Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
-{
-	struct dst_entry *old_dst;
-
-	old_dst = dest->dst_cache;
-	dest->dst_cache = NULL;
-	dst_release(old_dst);
-}
-
-#define IP_VS_XMIT(pf, skb, rt)				\
-do {							\
-	(skb)->ipvs_property = 1;			\
-	skb_forward_csum(skb);				\
-	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
-		(rt)->u.dst.dev, dst_output);		\
-} while (0)
-
-
-/*
- *      NULL transmitter (do nothing except return NF_ACCEPT)
- */
-int
-ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		struct ip_vs_protocol *pp)
-{
-	/* we do not touch skb and do not need pskb ptr */
-	return NF_ACCEPT;
-}
-
-
-/*
- *      Bypass transmitter
- *      Let packets bypass the destination when the destination is not
- *      available, it may be only used in transparent cache cluster.
- */
-int
-ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp)
-{
-	struct rtable *rt;			/* Route to the other host */
-	struct iphdr  *iph = ip_hdr(skb);
-	u8     tos = iph->tos;
-	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = iph->daddr,
-				.saddr = 0,
-				.tos = RT_TOS(tos), } },
-	};
-
-	EnterFunction(10);
-
-	if (ip_route_output_key(&init_net, &rt, &fl)) {
-		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
-			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
-		goto tx_error_icmp;
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
-		goto tx_error;
-	}
-
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
-		ip_rt_put(rt);
-		return NF_STOLEN;
-	}
-	ip_send_check(ip_hdr(skb));
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET, skb, rt);
-
-	LeaveFunction(10);
-	return NF_STOLEN;
-
- tx_error_icmp:
-	dst_link_failure(skb);
- tx_error:
-	kfree_skb(skb);
-	LeaveFunction(10);
-	return NF_STOLEN;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		     struct ip_vs_protocol *pp)
-{
-	struct rt6_info *rt;			/* Route to the other host */
-	struct ipv6hdr  *iph = ipv6_hdr(skb);
-	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};
-
-	EnterFunction(10);
-
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
-			     "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
-		goto tx_error_icmp;
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if (skb->len > mtu) {
-		dst_release(&rt->u.dst);
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
-		goto tx_error;
-	}
-
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (unlikely(skb == NULL)) {
-		dst_release(&rt->u.dst);
-		return NF_STOLEN;
-	}
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET6, skb, rt);
-
-	LeaveFunction(10);
-	return NF_STOLEN;
-
- tx_error_icmp:
-	dst_link_failure(skb);
- tx_error:
-	kfree_skb(skb);
-	LeaveFunction(10);
-	return NF_STOLEN;
-}
-#endif
-
-/*
- *      NAT transmitter (only for outside-to-inside nat forwarding)
- *      Not used for related ICMP
- */
-int
-ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-	       struct ip_vs_protocol *pp)
-{
-	struct rtable *rt;		/* Route to the other host */
-	int mtu;
-	struct iphdr *iph = ip_hdr(skb);
-
-	EnterFunction(10);
-
-	/* check if it is a connection of no-client-port */
-	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
-		__be16 _pt, *p;
-		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
-		if (p == NULL)
-			goto tx_error;
-		ip_vs_conn_fill_cport(cp, *p);
-		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
-	}
-
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-		goto tx_error;
-	}
-
-	/* copy-on-write the packet before mangling it */
-	if (!skb_make_writable(skb, sizeof(struct iphdr)))
-		goto tx_error_put;
-
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-		goto tx_error_put;
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/* mangle the packet */
-	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-		goto tx_error;
-	ip_hdr(skb)->daddr = cp->daddr.ip;
-	ip_send_check(ip_hdr(skb));
-
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-
-	/* FIXME: when application helper enlarges the packet and the length
-	   is larger than the MTU of outgoing device, there will be still
-	   MTU problem. */
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET, skb, rt);
-
-	LeaveFunction(10);
-	return NF_STOLEN;
-
-  tx_error_icmp:
-	dst_link_failure(skb);
-  tx_error:
-	LeaveFunction(10);
-	kfree_skb(skb);
-	return NF_STOLEN;
-  tx_error_put:
-	ip_rt_put(rt);
-	goto tx_error;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp)
-{
-	struct rt6_info *rt;		/* Route to the other host */
-	int mtu;
-
-	EnterFunction(10);
-
-	/* check if it is a connection of no-client-port */
-	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
-		__be16 _pt, *p;
-		p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
-				       sizeof(_pt), &_pt);
-		if (p == NULL)
-			goto tx_error;
-		ip_vs_conn_fill_cport(cp, *p);
-		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
-	}
-
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if (skb->len > mtu) {
-		dst_release(&rt->u.dst);
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-				 "ip_vs_nat_xmit_v6(): frag needed for");
-		goto tx_error;
-	}
-
-	/* copy-on-write the packet before mangling it */
-	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
-		goto tx_error_put;
-
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-		goto tx_error_put;
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/* mangle the packet */
-	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-		goto tx_error;
-	ipv6_hdr(skb)->daddr = cp->daddr.in6;
-
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-
-	/* FIXME: when application helper enlarges the packet and the length
-	   is larger than the MTU of outgoing device, there will be still
-	   MTU problem. */
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET6, skb, rt);
-
-	LeaveFunction(10);
-	return NF_STOLEN;
-
-tx_error_icmp:
-	dst_link_failure(skb);
-tx_error:
-	LeaveFunction(10);
-	kfree_skb(skb);
-	return NF_STOLEN;
-tx_error_put:
-	dst_release(&rt->u.dst);
-	goto tx_error;
-}
-#endif
-
-
-/*
- *   IP Tunneling transmitter
- *
- *   This function encapsulates the packet in a new IP packet, its
- *   destination will be set to cp->daddr. Most code of this function
- *   is taken from ipip.c.
- *
- *   It is used in VS/TUN cluster. The load balancer selects a real
- *   server from a cluster based on a scheduling algorithm,
- *   encapsulates the request packet and forwards it to the selected
- *   server. For example, all real servers are configured with
- *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
- *   the encapsulated packet, it will decapsulate the packet, processe
- *   the request and return the response packets directly to the client
- *   without passing the load balancer. This can greatly increase the
- *   scalability of virtual server.
- *
- *   Used for ANY protocol
- */
-int
-ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp)
-{
-	struct rtable *rt;			/* Route to the other host */
-	struct net_device *tdev;		/* Device to other host */
-	struct iphdr  *old_iph = ip_hdr(skb);
-	u8     tos = old_iph->tos;
-	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
-	struct iphdr  *iph;			/* Our new IP header */
-	unsigned int max_headroom;		/* The extra header space needed */
-	int    mtu;
-
-	EnterFunction(10);
-
-	if (skb->protocol != htons(ETH_P_IP)) {
-		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
-			     "ETH_P_IP: %d, skb protocol: %d\n",
-			     htons(ETH_P_IP), skb->protocol);
-		goto tx_error;
-	}
-
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
-		goto tx_error_icmp;
-
-	tdev = rt->u.dst.dev;
-
-	mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
-	if (mtu < 68) {
-		ip_rt_put(rt);
-		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
-		goto tx_error;
-	}
-	if (skb->dst)
-		skb->dst->ops->update_pmtu(skb->dst, mtu);
-
-	df |= (old_iph->frag_off & htons(IP_DF));
-
-	if ((old_iph->frag_off & htons(IP_DF))
-	    && mtu < ntohs(old_iph->tot_len)) {
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		ip_rt_put(rt);
-		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
-		goto tx_error;
-	}
-
-	/*
-	 * Okay, now see if we can stuff it in the buffer as-is.
-	 */
-	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
-
-	if (skb_headroom(skb) < max_headroom
-	    || skb_cloned(skb) || skb_shared(skb)) {
-		struct sk_buff *new_skb =
-			skb_realloc_headroom(skb, max_headroom);
-		if (!new_skb) {
-			ip_rt_put(rt);
-			kfree_skb(skb);
-			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
-			return NF_STOLEN;
-		}
-		kfree_skb(skb);
-		skb = new_skb;
-		old_iph = ip_hdr(skb);
-	}
-
-	skb->transport_header = old_transport_header;
-
-	/* fix old IP header checksum */
-	ip_send_check(old_iph);
-
-	skb_push(skb, sizeof(struct iphdr));
-	skb_reset_network_header(skb);
-	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/*
-	 *	Push down and install the IPIP header.
-	 */
-	iph			=	ip_hdr(skb);
-	iph->version		=	4;
-	iph->ihl		=	sizeof(struct iphdr)>>2;
-	iph->frag_off		=	df;
-	iph->protocol		=	IPPROTO_IPIP;
-	iph->tos		=	tos;
-	iph->daddr		=	rt->rt_dst;
-	iph->saddr		=	rt->rt_src;
-	iph->ttl		=	old_iph->ttl;
-	ip_select_ident(iph, &rt->u.dst, NULL);
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	ip_local_out(skb);
-
-	LeaveFunction(10);
-
-	return NF_STOLEN;
-
-  tx_error_icmp:
-	dst_link_failure(skb);
-  tx_error:
-	kfree_skb(skb);
-	LeaveFunction(10);
-	return NF_STOLEN;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		     struct ip_vs_protocol *pp)
-{
-	struct rt6_info *rt;		/* Route to the other host */
-	struct net_device *tdev;	/* Device to other host */
-	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
-	struct ipv6hdr  *iph;		/* Our new IP header */
-	unsigned int max_headroom;	/* The extra header space needed */
-	int    mtu;
-
-	EnterFunction(10);
-
-	if (skb->protocol != htons(ETH_P_IPV6)) {
-		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
-			     "ETH_P_IPV6: %d, skb protocol: %d\n",
-			     htons(ETH_P_IPV6), skb->protocol);
-		goto tx_error;
-	}
-
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
-		goto tx_error_icmp;
-
-	tdev = rt->u.dst.dev;
-
-	mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
-		dst_release(&rt->u.dst);
-		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
-		goto tx_error;
-	}
-	if (skb->dst)
-		skb->dst->ops->update_pmtu(skb->dst, mtu);
-
-	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-		dst_release(&rt->u.dst);
-		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
-		goto tx_error;
-	}
-
-	/*
-	 * Okay, now see if we can stuff it in the buffer as-is.
-	 */
-	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
-
-	if (skb_headroom(skb) < max_headroom
-	    || skb_cloned(skb) || skb_shared(skb)) {
-		struct sk_buff *new_skb =
-			skb_realloc_headroom(skb, max_headroom);
-		if (!new_skb) {
-			dst_release(&rt->u.dst);
-			kfree_skb(skb);
-			IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
-			return NF_STOLEN;
-		}
-		kfree_skb(skb);
-		skb = new_skb;
-		old_iph = ipv6_hdr(skb);
-	}
-
-	skb->transport_header = old_transport_header;
-
-	skb_push(skb, sizeof(struct ipv6hdr));
-	skb_reset_network_header(skb);
-	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/*
-	 *	Push down and install the IPIP header.
-	 */
-	iph			=	ipv6_hdr(skb);
-	iph->version		=	6;
-	iph->nexthdr		=	IPPROTO_IPV6;
-	iph->payload_len	=	old_iph->payload_len + sizeof(old_iph);
-	iph->priority		=	old_iph->priority;
-	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
-	iph->hop_limit		=	old_iph->hop_limit;
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	ip6_local_out(skb);
-
-	LeaveFunction(10);
-
-	return NF_STOLEN;
-
-tx_error_icmp:
-	dst_link_failure(skb);
-tx_error:
-	kfree_skb(skb);
-	LeaveFunction(10);
-	return NF_STOLEN;
-}
-#endif
-
-
-/*
- *      Direct Routing transmitter
- *      Used for ANY protocol
- */
-int
-ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-	      struct ip_vs_protocol *pp)
-{
-	struct rtable *rt;			/* Route to the other host */
-	struct iphdr  *iph = ip_hdr(skb);
-	int    mtu;
-
-	EnterFunction(10);
-
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		ip_rt_put(rt);
-		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
-		goto tx_error;
-	}
-
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
-		ip_rt_put(rt);
-		return NF_STOLEN;
-	}
-	ip_send_check(ip_hdr(skb));
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET, skb, rt);
-
-	LeaveFunction(10);
-	return NF_STOLEN;
-
-  tx_error_icmp:
-	dst_link_failure(skb);
-  tx_error:
-	kfree_skb(skb);
-	LeaveFunction(10);
-	return NF_STOLEN;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		 struct ip_vs_protocol *pp)
-{
-	struct rt6_info *rt;			/* Route to the other host */
-	int    mtu;
-
-	EnterFunction(10);
-
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if (skb->len > mtu) {
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-		dst_release(&rt->u.dst);
-		IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
-		goto tx_error;
-	}
-
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (unlikely(skb == NULL)) {
-		dst_release(&rt->u.dst);
-		return NF_STOLEN;
-	}
-
-	/* drop old route */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET6, skb, rt);
-
-	LeaveFunction(10);
-	return NF_STOLEN;
-
-tx_error_icmp:
-	dst_link_failure(skb);
-tx_error:
-	kfree_skb(skb);
-	LeaveFunction(10);
-	return NF_STOLEN;
-}
-#endif
-
-
-/*
- *	ICMP packet transmitter
- *	called by the ip_vs_in_icmp
- */
-int
-ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		struct ip_vs_protocol *pp, int offset)
-{
-	struct rtable	*rt;	/* Route to the other host */
-	int mtu;
-	int rc;
-
-	EnterFunction(10);
-
-	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
-	   forwarded directly here, because there is no need to
-	   translate address/port back */
-	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
-		if (cp->packet_xmit)
-			rc = cp->packet_xmit(skb, cp, pp);
-		else
-			rc = NF_ACCEPT;
-		/* do not touch skb anymore */
-		atomic_inc(&cp->in_pkts);
-		goto out;
-	}
-
-	/*
-	 * mangle and send the packet here (only for VS/NAT)
-	 */
-
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
-		goto tx_error;
-	}
-
-	/* copy-on-write the packet before mangling it */
-	if (!skb_make_writable(skb, offset))
-		goto tx_error_put;
-
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-		goto tx_error_put;
-
-	/* drop the old route when skb is not shared */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	ip_vs_nat_icmp(skb, pp, cp, 0);
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET, skb, rt);
-
-	rc = NF_STOLEN;
-	goto out;
-
-  tx_error_icmp:
-	dst_link_failure(skb);
-  tx_error:
-	dev_kfree_skb(skb);
-	rc = NF_STOLEN;
-  out:
-	LeaveFunction(10);
-	return rc;
-  tx_error_put:
-	ip_rt_put(rt);
-	goto tx_error;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		struct ip_vs_protocol *pp, int offset)
-{
-	struct rt6_info	*rt;	/* Route to the other host */
-	int mtu;
-	int rc;
-
-	EnterFunction(10);
-
-	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
-	   forwarded directly here, because there is no need to
-	   translate address/port back */
-	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
-		if (cp->packet_xmit)
-			rc = cp->packet_xmit(skb, cp, pp);
-		else
-			rc = NF_ACCEPT;
-		/* do not touch skb anymore */
-		atomic_inc(&cp->in_pkts);
-		goto out;
-	}
-
-	/*
-	 * mangle and send the packet here (only for VS/NAT)
-	 */
-
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
-	if (skb->len > mtu) {
-		dst_release(&rt->u.dst);
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
-		goto tx_error;
-	}
-
-	/* copy-on-write the packet before mangling it */
-	if (!skb_make_writable(skb, offset))
-		goto tx_error_put;
-
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-		goto tx_error_put;
-
-	/* drop the old route when skb is not shared */
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
-
-	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
-
-	/* Another hack: avoid icmp_send in ip_fragment */
-	skb->local_df = 1;
-
-	IP_VS_XMIT(PF_INET6, skb, rt);
-
-	rc = NF_STOLEN;
-	goto out;
-
-tx_error_icmp:
-	dst_link_failure(skb);
-tx_error:
-	dev_kfree_skb(skb);
-	rc = NF_STOLEN;
-out:
-	LeaveFunction(10);
-	return rc;
-tx_error_put:
-	dst_release(&rt->u.dst);
-	goto tx_error;
-}
-#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ee898e74808..73f9378e1bf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -838,3 +838,5 @@ config NETFILTER_XT_MATCH_HASHLIMIT
 
 endmenu
 
+source "net/netfilter/ipvs/Kconfig"
+
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3bd2cc556ae..cf75055be83 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -83,3 +83,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+
+# IPVS
+obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
new file mode 100644
index 00000000000..de6004de80b
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,239 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+	tristate "IP virtual server support (EXPERIMENTAL)"
+	depends on NETFILTER
+	---help---
+	  IP Virtual Server support will let you build a high-performance
+	  virtual server based on cluster of two or more real servers. This
+	  option must be enabled for at least one of the clustered computers
+	  that will take care of intercepting incoming connections to a
+	  single IP address and scheduling them to real servers.
+
+	  Three request dispatching techniques are implemented, they are
+	  virtual server via NAT, virtual server via tunneling and virtual
+	  server via direct routing. The several scheduling algorithms can
+	  be used to choose which server the connection is directed to,
+	  thus load balancing can be achieved among the servers.  For more
+	  information and its administration program, please visit the
+	  following URL: <http://www.linuxvirtualserver.org/>.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config	IP_VS_IPV6
+	bool "IPv6 support for IPVS (DANGEROUS)"
+	depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
+	---help---
+	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+
+	  Say N if unsure.
+
+config	IP_VS_DEBUG
+	bool "IP virtual server debugging"
+	---help---
+	  Say Y here if you want to get additional messages useful in
+	  debugging the IP virtual server code. You can change the debug
+	  level in /proc/sys/net/ipv4/vs/debug_level
+
+config	IP_VS_TAB_BITS
+	int "IPVS connection table size (the Nth power of 2)"
+	range 8 20
+	default 12
+	---help---
+	  The IPVS connection hash table uses the chaining scheme to handle
+	  hash collisions. Using a big IPVS connection hash table will greatly
+	  reduce conflicts when there are hundreds of thousands of connections
+	  in the hash table.
+
+	  Note the table size must be power of 2. The table size will be the
+	  value of 2 to the your input number power. The number to choose is
+	  from 8 to 20, the default number is 12, which means the table size
+	  is 4096. Don't input the number too small, otherwise you will lose
+	  performance on it. You can adapt the table size yourself, according
+	  to your virtual server application. It is good to set the table size
+	  not far less than the number of connections per second multiplying
+	  average lasting time of connection in the table.  For example, your
+	  virtual server gets 200 connections per second, the connection lasts
+	  for 200 seconds in average in the connection table, the table size
+	  should be not far less than 200x200, it is good to set the table
+	  size 32768 (2**15).
+
+	  Another note that each connection occupies 128 bytes effectively and
+	  each hash entry uses 8 bytes, so you can estimate how much memory is
+	  needed for your box.
+
+comment "IPVS transport protocol load balancing support"
+
+config	IP_VS_PROTO_TCP
+	bool "TCP load balancing support"
+	---help---
+	  This option enables support for load balancing TCP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_UDP
+	bool "UDP load balancing support"
+	---help---
+	  This option enables support for load balancing UDP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_AH_ESP
+	bool
+	depends on UNDEFINED
+
+config	IP_VS_PROTO_ESP
+	bool "ESP load balancing support"
+	select IP_VS_PROTO_AH_ESP
+	---help---
+	  This option enables support for load balancing ESP (Encapsulation
+	  Security Payload) transport protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_AH
+	bool "AH load balancing support"
+	select IP_VS_PROTO_AH_ESP
+	---help---
+	  This option enables support for load balancing AH (Authentication
+	  Header) transport protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config	IP_VS_RR
+	tristate "round-robin scheduling"
+	---help---
+	  The robin-robin scheduling algorithm simply directs network
+	  connections to different real servers in a round-robin manner.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+ 
+config	IP_VS_WRR
+        tristate "weighted round-robin scheduling" 
+	---help---
+	  The weighted robin-robin scheduling algorithm directs network
+	  connections to different real servers based on server weights
+	  in a round-robin manner. Servers with higher weights receive
+	  new connections first than those with less weights, and servers
+	  with higher weights get more connections than those with less
+	  weights and servers with equal weights get equal connections.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_LC
+        tristate "least-connection scheduling"
+	---help---
+	  The least-connection scheduling algorithm directs network
+	  connections to the server with the least number of active 
+	  connections.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_WLC
+        tristate "weighted least-connection scheduling"
+	---help---
+	  The weighted least-connection scheduling algorithm directs network
+	  connections to the server with the least active connections
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_LBLC
+	tristate "locality-based least-connection scheduling"
+	---help---
+	  The locality-based least-connection scheduling algorithm is for
+	  destination IP load balancing. It is usually used in cache cluster.
+	  This algorithm usually directs packet destined for an IP address to
+	  its server if the server is alive and under load. If the server is
+	  overloaded (its active connection numbers is larger than its weight)
+	  and there is a server in its half load, then allocate the weighted
+	  least-connection server to this IP address.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config  IP_VS_LBLCR
+	tristate "locality-based least-connection with replication scheduling"
+	---help---
+	  The locality-based least-connection with replication scheduling
+	  algorithm is also for destination IP load balancing. It is 
+	  usually used in cache cluster. It differs from the LBLC scheduling
+	  as follows: the load balancer maintains mappings from a target
+	  to a set of server nodes that can serve the target. Requests for
+	  a target are assigned to the least-connection node in the target's
+	  server set. If all the node in the server set are over loaded,
+	  it picks up a least-connection node in the cluster and adds it
+	  in the sever set for the target. If the server set has not been
+	  modified for the specified time, the most loaded node is removed
+	  from the server set, in order to avoid high degree of replication.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_DH
+	tristate "destination hashing scheduling"
+	---help---
+	  The destination hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their destination IP addresses.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_SH
+	tristate "source hashing scheduling"
+	---help---
+	  The source hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their source IP addresses.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_SED
+	tristate "shortest expected delay scheduling"
+	---help---
+	  The shortest expected delay scheduling algorithm assigns network
+	  connections to the server with the shortest expected delay. The 
+	  expected delay that the job will experience is (Ci + 1) / Ui if 
+	  sent to the ith server, in which Ci is the number of connections
+	  on the ith server and Ui is the fixed service rate (weight)
+	  of the ith server.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_NQ
+	tristate "never queue scheduling"
+	---help---
+	  The never queue scheduling algorithm adopts a two-speed model.
+	  When there is an idle server available, the job will be sent to
+	  the idle server, instead of waiting for a fast one. When there
+	  is no idle server available, the job will be sent to the server
+	  that minimize its expected delay (The Shortest Expected Delay
+	  scheduling algorithm).
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+comment 'IPVS application helper'
+
+config	IP_VS_FTP
+  	tristate "FTP protocol helper"
+        depends on IP_VS_PROTO_TCP
+	---help---
+	  FTP is a protocol that transfers IP address and/or port number in
+	  the payload. In the virtual server via Network Address Translation,
+	  the IP address and port number of real servers cannot be sent to
+	  clients in ftp connections directly, so FTP protocol helper is
+	  required for tracking the connection and mangling it back to that of
+	  virtual service.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 00000000000..73a46fe1fe4
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,33 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+
+ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
+		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
+		ip_vs_est.o ip_vs_proto.o 				   \
+		$(ip_vs_proto-objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 00000000000..201b8ea3020
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,622 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ *		IP_MASQ_APP application masquerading module
+ *
+ * Author:	Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+/* ipvs application list head */
+static LIST_HEAD(ip_vs_app_list);
+static DEFINE_MUTEX(__ip_vs_app_mutex);
+
+
+/*
+ *	Get an ip_vs_app object
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+	return try_module_get(app->module);
+}
+
+
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+	module_put(app->module);
+}
+
+
+/*
+ *	Allocate/initialize app incarnation and register it in proto apps.
+ */
+static int
+ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+	struct ip_vs_protocol *pp;
+	struct ip_vs_app *inc;
+	int ret;
+
+	if (!(pp = ip_vs_proto_get(proto)))
+		return -EPROTONOSUPPORT;
+
+	if (!pp->unregister_app)
+		return -EOPNOTSUPP;
+
+	inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
+	if (!inc)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&inc->p_list);
+	INIT_LIST_HEAD(&inc->incs_list);
+	inc->app = app;
+	inc->port = htons(port);
+	atomic_set(&inc->usecnt, 0);
+
+	if (app->timeouts) {
+		inc->timeout_table =
+			ip_vs_create_timeout_table(app->timeouts,
+						   app->timeouts_size);
+		if (!inc->timeout_table) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ret = pp->register_app(inc);
+	if (ret)
+		goto out;
+
+	list_add(&inc->a_list, &app->incs_list);
+	IP_VS_DBG(9, "%s application %s:%u registered\n",
+		  pp->name, inc->name, inc->port);
+
+	return 0;
+
+  out:
+	kfree(inc->timeout_table);
+	kfree(inc);
+	return ret;
+}
+
+
+/*
+ *	Release app incarnation
+ */
+static void
+ip_vs_app_inc_release(struct ip_vs_app *inc)
+{
+	struct ip_vs_protocol *pp;
+
+	if (!(pp = ip_vs_proto_get(inc->protocol)))
+		return;
+
+	if (pp->unregister_app)
+		pp->unregister_app(inc);
+
+	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+		  pp->name, inc->name, inc->port);
+
+	list_del(&inc->a_list);
+
+	kfree(inc->timeout_table);
+	kfree(inc);
+}
+
+
+/*
+ *	Get reference to app inc (only called from softirq)
+ *
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+	int result;
+
+	atomic_inc(&inc->usecnt);
+	if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
+		atomic_dec(&inc->usecnt);
+	return result;
+}
+
+
+/*
+ *	Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+	ip_vs_app_put(inc->app);
+	atomic_dec(&inc->usecnt);
+}
+
+
+/*
+ *	Register an application incarnation in protocol applications
+ */
+int
+register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+	int result;
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	result = ip_vs_app_inc_new(app, proto, port);
+
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	return result;
+}
+
+
+/*
+ *	ip_vs_app registration routine
+ */
+int register_ip_vs_app(struct ip_vs_app *app)
+{
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	list_add(&app->a_list, &ip_vs_app_list);
+
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	return 0;
+}
+
+
+/*
+ *	ip_vs_app unregistration routine
+ *	We are sure there are no app incarnations attached to services
+ */
+void unregister_ip_vs_app(struct ip_vs_app *app)
+{
+	struct ip_vs_app *inc, *nxt;
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
+		ip_vs_app_inc_release(inc);
+	}
+
+	list_del(&app->a_list);
+
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+
+/*
+ *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+{
+	return pp->app_conn_bind(cp);
+}
+
+
+/*
+ *	Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+	struct ip_vs_app *inc = cp->app;
+
+	if (!inc)
+		return;
+
+	if (inc->unbind_conn)
+		inc->unbind_conn(inc, cp);
+	if (inc->done_conn)
+		inc->done_conn(inc, cp);
+	ip_vs_app_inc_put(inc);
+	cp->app = NULL;
+}
+
+
+/*
+ *	Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	__u32 seq = ntohl(th->seq);
+
+	/*
+	 *	Adjust seq with delta-offset for all packets after
+	 *	the most recent resized pkt seq and with previous_delta offset
+	 *	for all packets	before most recent resized pkt seq.
+	 */
+	if (vseq->delta || vseq->previous_delta) {
+		if(after(seq, vseq->init_seq)) {
+			th->seq = htonl(seq + vseq->delta);
+			IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
+				  vseq->delta);
+		} else {
+			th->seq = htonl(seq + vseq->previous_delta);
+			IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
+				  "(%d) to seq\n", vseq->previous_delta);
+		}
+	}
+}
+
+
+/*
+ *	Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	__u32 ack_seq = ntohl(th->ack_seq);
+
+	/*
+	 * Adjust ack_seq with delta-offset for
+	 * the packets AFTER most recent resized pkt has caused a shift
+	 * for packets before most recent resized pkt, use previous_delta
+	 */
+	if (vseq->delta || vseq->previous_delta) {
+		/* since ack_seq is the number of octet that is expected
+		   to receive next, so compare it with init_seq+delta */
+		if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+			th->ack_seq = htonl(ack_seq - vseq->delta);
+			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
+				  "(%d) from ack_seq\n", vseq->delta);
+
+		} else {
+			th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
+				  "previous_delta (%d) from ack_seq\n",
+				  vseq->previous_delta);
+		}
+	}
+}
+
+
+/*
+ *	Updates ip_vs_seq if pkt has been resized
+ *	Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+				 unsigned flag, __u32 seq, int diff)
+{
+	/* spinlock is to keep updating cp->flags atomic */
+	spin_lock(&cp->lock);
+	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+		vseq->previous_delta = vseq->delta;
+		vseq->delta += diff;
+		vseq->init_seq = seq;
+		cp->flags |= flag;
+	}
+	spin_unlock(&cp->lock);
+}
+
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+				  struct ip_vs_app *app)
+{
+	int diff;
+	const unsigned int tcp_offset = ip_hdrlen(skb);
+	struct tcphdr *th;
+	__u32 seq;
+
+	if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+		return 0;
+
+	th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+	/*
+	 *	Remember seq number in case this pkt gets resized
+	 */
+	seq = ntohl(th->seq);
+
+	/*
+	 *	Fix seq stuff if flagged as so.
+	 */
+	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+		vs_fix_seq(&cp->out_seq, th);
+	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+		vs_fix_ack_seq(&cp->in_seq, th);
+
+	/*
+	 *	Call private output hook function
+	 */
+	if (app->pkt_out == NULL)
+		return 1;
+
+	if (!app->pkt_out(app, cp, skb, &diff))
+		return 0;
+
+	/*
+	 *	Update ip_vs seq stuff if len has changed.
+	 */
+	if (diff != 0)
+		vs_seq_update(cp, &cp->out_seq,
+			      IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+	return 1;
+}
+
+/*
+ *	Output pkt hook. Will call bound ip_vs_app specific function
+ *	called by ipvs packet handler, assumes previously checked cp!=NULL
+ *	returns false if it can't handle packet (oom)
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_app *app;
+
+	/*
+	 *	check if application module is bound to
+	 *	this ip_vs_conn.
+	 */
+	if ((app = cp->app) == NULL)
+		return 1;
+
+	/* TCP is complicated */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_out(cp, skb, app);
+
+	/*
+	 *	Call private output hook function
+	 */
+	if (app->pkt_out == NULL)
+		return 1;
+
+	return app->pkt_out(app, cp, skb, NULL);
+}
+
+
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+				 struct ip_vs_app *app)
+{
+	int diff;
+	const unsigned int tcp_offset = ip_hdrlen(skb);
+	struct tcphdr *th;
+	__u32 seq;
+
+	if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+		return 0;
+
+	th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+	/*
+	 *	Remember seq number in case this pkt gets resized
+	 */
+	seq = ntohl(th->seq);
+
+	/*
+	 *	Fix seq stuff if flagged as so.
+	 */
+	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+		vs_fix_seq(&cp->in_seq, th);
+	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+		vs_fix_ack_seq(&cp->out_seq, th);
+
+	/*
+	 *	Call private input hook function
+	 */
+	if (app->pkt_in == NULL)
+		return 1;
+
+	if (!app->pkt_in(app, cp, skb, &diff))
+		return 0;
+
+	/*
+	 *	Update ip_vs seq stuff if len has changed.
+	 */
+	if (diff != 0)
+		vs_seq_update(cp, &cp->in_seq,
+			      IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+	return 1;
+}
+
+/*
+ *	Input pkt hook. Will call bound ip_vs_app specific function
+ *	called by ipvs packet handler, assumes previously checked cp!=NULL.
+ *	returns false if can't handle packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_app *app;
+
+	/*
+	 *	check if application module is bound to
+	 *	this ip_vs_conn.
+	 */
+	if ((app = cp->app) == NULL)
+		return 1;
+
+	/* TCP is complicated */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_in(cp, skb, app);
+
+	/*
+	 *	Call private input hook function
+	 */
+	if (app->pkt_in == NULL)
+		return 1;
+
+	return app->pkt_in(app, cp, skb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ *	/proc/net/ip_vs_app entry function
+ */
+
+static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+{
+	struct ip_vs_app *app, *inc;
+
+	list_for_each_entry(app, &ip_vs_app_list, a_list) {
+		list_for_each_entry(inc, &app->incs_list, a_list) {
+			if (pos-- == 0)
+				return inc;
+		}
+	}
+	return NULL;
+
+}
+
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	mutex_lock(&__ip_vs_app_mutex);
+
+	return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_app *inc, *app;
+	struct list_head *e;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_app_idx(0);
+
+	inc = v;
+	app = inc->app;
+
+	if ((e = inc->a_list.next) != &app->incs_list)
+		return list_entry(e, struct ip_vs_app, a_list);
+
+	/* go on to next application */
+	for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+		app = list_entry(e, struct ip_vs_app, a_list);
+		list_for_each_entry(inc, &app->incs_list, a_list) {
+			return inc;
+		}
+	}
+	return NULL;
+}
+
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+	mutex_unlock(&__ip_vs_app_mutex);
+}
+
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, "prot port    usecnt name\n");
+	else {
+		const struct ip_vs_app *inc = v;
+
+		seq_printf(seq, "%-3s  %-7u %-6d %-17s\n",
+			   ip_vs_proto_name(inc->protocol),
+			   ntohs(inc->port),
+			   atomic_read(&inc->usecnt),
+			   inc->name);
+	}
+	return 0;
+}
+
+static const struct seq_operations ip_vs_app_seq_ops = {
+	.start = ip_vs_app_seq_start,
+	.next  = ip_vs_app_seq_next,
+	.stop  = ip_vs_app_seq_stop,
+	.show  = ip_vs_app_seq_show,
+};
+
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ip_vs_app_seq_ops);
+}
+
+static const struct file_operations ip_vs_app_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = ip_vs_app_open,
+	.read	 = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+#endif
+
+
+/*
+ *	Replace a segment of data with a new segment
+ */
+int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
+		      char *o_buf, int o_len, char *n_buf, int n_len)
+{
+	int diff;
+	int o_offset;
+	int o_left;
+
+	EnterFunction(9);
+
+	diff = n_len - o_len;
+	o_offset = o_buf - (char *)skb->data;
+	/* The length of left data after o_buf+o_len in the skb data */
+	o_left = skb->len - (o_offset + o_len);
+
+	if (diff <= 0) {
+		memmove(o_buf + n_len, o_buf + o_len, o_left);
+		memcpy(o_buf, n_buf, n_len);
+		skb_trim(skb, skb->len + diff);
+	} else if (diff <= skb_tailroom(skb)) {
+		skb_put(skb, diff);
+		memmove(o_buf + n_len, o_buf + o_len, o_left);
+		memcpy(o_buf, n_buf, n_len);
+	} else {
+		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
+			return -ENOMEM;
+		skb_put(skb, diff);
+		memmove(skb->data + o_offset + n_len,
+			skb->data + o_offset + o_len, o_left);
+		skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
+	}
+
+	/* must update the iph total length here */
+	ip_hdr(skb)->tot_len = htons(skb->len);
+
+	LeaveFunction(9);
+	return 0;
+}
+
+
+int __init ip_vs_app_init(void)
+{
+	/* we will replace it with proc_net_ipvs_create() soon */
+	proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+	return 0;
+}
+
+
+void ip_vs_app_cleanup(void)
+{
+	proc_net_remove(&init_net, "ip_vs_app");
+}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
new file mode 100644
index 00000000000..9a24332fbed
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1110 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Many code here is taken from IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>		/* for proc_net_* */
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+/*
+ *  Connection hash table: for input and output packets lookups of IPVS
+ */
+static struct list_head *ip_vs_conn_tab;
+
+/*  SLAB cache for IPVS connections */
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
+
+/*  counter for current IPVS connections */
+static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
+
+/*  counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd;
+
+/*
+ *  Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS  4
+#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
+
+struct ip_vs_aligned_lock
+{
+	rwlock_t	l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_read_lock(unsigned key)
+{
+	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock(unsigned key)
+{
+	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock(unsigned key)
+{
+	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock(unsigned key)
+{
+	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_lock_bh(unsigned key)
+{
+	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock_bh(unsigned key)
+{
+	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock_bh(unsigned key)
+{
+	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned key)
+{
+	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ *	Returns hash value for IPVS connection entry
+ */
+static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+				       const union nf_inet_addr *addr,
+				       __be16 port)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+				    (__force u32)port, proto, ip_vs_conn_rnd)
+			& IP_VS_CONN_TAB_MASK;
+#endif
+	return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+			    ip_vs_conn_rnd)
+		& IP_VS_CONN_TAB_MASK;
+}
+
+
+/*
+ *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ *	returns bool success.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+	unsigned hash;
+	int ret;
+
+	/* Hash by protocol, client address and port */
+	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+
+	ct_write_lock(hash);
+
+	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+		cp->flags |= IP_VS_CONN_F_HASHED;
+		atomic_inc(&cp->refcnt);
+		ret = 1;
+	} else {
+		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		ret = 0;
+	}
+
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
+
+/*
+ *	UNhashes ip_vs_conn from ip_vs_conn_tab.
+ *	returns bool success.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+	unsigned hash;
+	int ret;
+
+	/* unhash it and decrease its reference counter */
+	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+
+	ct_write_lock(hash);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		list_del(&cp->c_list);
+		cp->flags &= ~IP_VS_CONN_F_HASHED;
+		atomic_dec(&cp->refcnt);
+		ret = 1;
+	} else
+		ret = 0;
+
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from OUTside-to-INside.
+ *	s_addr, s_port: pkt source address (foreign host)
+ *	d_addr, d_port: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *__ip_vs_conn_in_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp;
+
+	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+
+	ct_read_lock(hash);
+
+	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (cp->af == af &&
+		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
+		    s_port == cp->cport && d_port == cp->vport &&
+		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+		    protocol == cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ct_read_unlock(hash);
+			return cp;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+	struct ip_vs_conn *cp;
+
+	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
+		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
+					 d_port);
+
+	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      cp ? "hit" : "not hit");
+
+	return cp;
+}
+
+/* Get reference to connection template */
+struct ip_vs_conn *ip_vs_ct_in_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp;
+
+	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+
+	ct_read_lock(hash);
+
+	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (cp->af == af &&
+		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
+		    s_port == cp->cport && d_port == cp->vport &&
+		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
+		    protocol == cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			goto out;
+		}
+	}
+	cp = NULL;
+
+  out:
+	ct_read_unlock(hash);
+
+	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      cp ? "hit" : "not hit");
+
+	return cp;
+}
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from inside-to-OUTside.
+ *	s_addr, s_port: pkt source address (inside host)
+ *	d_addr, d_port: pkt dest address (foreign host)
+ */
+struct ip_vs_conn *ip_vs_conn_out_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp, *ret=NULL;
+
+	/*
+	 *	Check for "full" addressed entries
+	 */
+	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+
+	ct_read_lock(hash);
+
+	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (cp->af == af &&
+		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
+		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
+		    d_port == cp->cport && s_port == cp->dport &&
+		    protocol == cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ret = cp;
+			break;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ret ? "hit" : "not hit");
+
+	return ret;
+}
+
+
+/*
+ *      Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+	/* reset it expire in its timeout */
+	mod_timer(&cp->timer, jiffies+cp->timeout);
+
+	__ip_vs_conn_put(cp);
+}
+
+
+/*
+ *	Fill a no_client_port connection with a client port number
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+{
+	if (ip_vs_conn_unhash(cp)) {
+		spin_lock(&cp->lock);
+		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+			atomic_dec(&ip_vs_conn_no_cport_cnt);
+			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+			cp->cport = cport;
+		}
+		spin_unlock(&cp->lock);
+
+		/* hash on new dport */
+		ip_vs_conn_hash(cp);
+	}
+}
+
+
+/*
+ *	Bind a connection entry with the corresponding packet_xmit.
+ *	Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit;
+		break;
+	}
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit_v6;
+		break;
+	}
+}
+#endif
+
+
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->activeconns)
+		+ atomic_read(&dest->inactconns);
+}
+
+/*
+ *	Bind a connection entry with a virtual service destination
+ *	Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+	/* if dest is NULL, then return directly */
+	if (!dest)
+		return;
+
+	/* Increase the refcnt counter of the dest */
+	atomic_inc(&dest->refcnt);
+
+	/* Bind with the destination and its corresponding transmitter */
+	if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+	    (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+		/* if the connection is not template and is created
+		 * by sync, preserve the activity flag.
+		 */
+		cp->flags |= atomic_read(&dest->conn_flags) &
+			     (~IP_VS_CONN_F_INACTIVE);
+	else
+		cp->flags |= atomic_read(&dest->conn_flags);
+	cp->dest = dest;
+
+	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
+
+	/* Update the connection counters */
+	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+		/* It is a normal connection, so increase the inactive
+		   connection counter because it is in TCP SYNRECV
+		   state (inactive) or other protocol inacive state */
+		if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+		    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+			atomic_inc(&dest->activeconns);
+		else
+			atomic_inc(&dest->inactconns);
+	} else {
+		/* It is a persistent connection/template, so increase
+		   the peristent connection counter */
+		atomic_inc(&dest->persistconns);
+	}
+
+	if (dest->u_threshold != 0 &&
+	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+		dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Check if there is a destination for the connection, if so
+ * bind the connection to the destination.
+ */
+struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest;
+
+	if ((cp) && (!cp->dest)) {
+		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
+				       &cp->vaddr, cp->vport,
+				       cp->protocol);
+		ip_vs_bind_dest(cp, dest);
+		return dest;
+	} else
+		return NULL;
+}
+
+
+/*
+ *	Unbind a connection entry with its VS destination
+ *	Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest = cp->dest;
+
+	if (!dest)
+		return;
+
+	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
+
+	/* Update the connection counters */
+	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+		/* It is a normal connection, so decrease the inactconns
+		   or activeconns counter */
+		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+			atomic_dec(&dest->inactconns);
+		} else {
+			atomic_dec(&dest->activeconns);
+		}
+	} else {
+		/* It is a persistent connection/template, so decrease
+		   the peristent connection counter */
+		atomic_dec(&dest->persistconns);
+	}
+
+	if (dest->l_threshold != 0) {
+		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	} else if (dest->u_threshold != 0) {
+		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	} else {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	}
+
+	/*
+	 * Simply decrease the refcnt of the dest, because the
+	 * dest will be either in service's destination list
+	 * or in the trash.
+	 */
+	atomic_dec(&dest->refcnt);
+}
+
+
+/*
+ *	Checking if the destination of a connection template is available.
+ *	If available, return 1, otherwise invalidate this connection
+ *	template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+	struct ip_vs_dest *dest = ct->dest;
+
+	/*
+	 * Checking the dest server status.
+	 */
+	if ((dest == NULL) ||
+	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
+	    (sysctl_ip_vs_expire_quiescent_template &&
+	     (atomic_read(&dest->weight) == 0))) {
+		IP_VS_DBG_BUF(9, "check_template: dest not available for "
+			      "protocol %s s:%s:%d v:%s:%d "
+			      "-> d:%s:%d\n",
+			      ip_vs_proto_name(ct->protocol),
+			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+			      ntohs(ct->cport),
+			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+			      ntohs(ct->vport),
+			      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
+			      ntohs(ct->dport));
+
+		/*
+		 * Invalidate the connection template
+		 */
+		if (ct->vport != htons(0xffff)) {
+			if (ip_vs_conn_unhash(ct)) {
+				ct->dport = htons(0xffff);
+				ct->vport = htons(0xffff);
+				ct->cport = 0;
+				ip_vs_conn_hash(ct);
+			}
+		}
+
+		/*
+		 * Simply decrease the refcnt of the template,
+		 * don't restart its timer.
+		 */
+		atomic_dec(&ct->refcnt);
+		return 0;
+	}
+	return 1;
+}
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+
+	cp->timeout = 60*HZ;
+
+	/*
+	 *	hey, I'm using it
+	 */
+	atomic_inc(&cp->refcnt);
+
+	/*
+	 *	do I control anybody?
+	 */
+	if (atomic_read(&cp->n_control))
+		goto expire_later;
+
+	/*
+	 *	unhash it if it is hashed in the conn table
+	 */
+	if (!ip_vs_conn_unhash(cp))
+		goto expire_later;
+
+	/*
+	 *	refcnt==1 implies I'm the only one referrer
+	 */
+	if (likely(atomic_read(&cp->refcnt) == 1)) {
+		/* delete the timer if it is activated by other users */
+		if (timer_pending(&cp->timer))
+			del_timer(&cp->timer);
+
+		/* does anybody control me? */
+		if (cp->control)
+			ip_vs_control_del(cp);
+
+		if (unlikely(cp->app != NULL))
+			ip_vs_unbind_app(cp);
+		ip_vs_unbind_dest(cp);
+		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		atomic_dec(&ip_vs_conn_count);
+
+		kmem_cache_free(ip_vs_conn_cachep, cp);
+		return;
+	}
+
+	/* hash it back to the table */
+	ip_vs_conn_hash(cp);
+
+  expire_later:
+	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
+		  atomic_read(&cp->refcnt)-1,
+		  atomic_read(&cp->n_control));
+
+	ip_vs_conn_put(cp);
+}
+
+
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+	if (del_timer(&cp->timer))
+		mod_timer(&cp->timer, jiffies);
+}
+
+
+/*
+ *	Create a new connection entry and hash it into the ip_vs_conn_tab
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
+	       const union nf_inet_addr *vaddr, __be16 vport,
+	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+	if (cp == NULL) {
+		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&cp->c_list);
+	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+	cp->af		   = af;
+	cp->protocol	   = proto;
+	ip_vs_addr_copy(af, &cp->caddr, caddr);
+	cp->cport	   = cport;
+	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
+	cp->vport	   = vport;
+	ip_vs_addr_copy(af, &cp->daddr, daddr);
+	cp->dport          = dport;
+	cp->flags	   = flags;
+	spin_lock_init(&cp->lock);
+
+	/*
+	 * Set the entry is referenced by the current thread before hashing
+	 * it in the table, so that other thread run ip_vs_random_dropentry
+	 * but cannot drop this entry.
+	 */
+	atomic_set(&cp->refcnt, 1);
+
+	atomic_set(&cp->n_control, 0);
+	atomic_set(&cp->in_pkts, 0);
+
+	atomic_inc(&ip_vs_conn_count);
+	if (flags & IP_VS_CONN_F_NO_CPORT)
+		atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+	/* Bind the connection with a destination server */
+	ip_vs_bind_dest(cp, dest);
+
+	/* Set its state and timeout */
+	cp->state = 0;
+	cp->timeout = 3*HZ;
+
+	/* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_bind_xmit_v6(cp);
+	else
+#endif
+		ip_vs_bind_xmit(cp);
+
+	if (unlikely(pp && atomic_read(&pp->appcnt)))
+		ip_vs_bind_app(cp, pp);
+
+	/* Hash it in the ip_vs_conn_tab finally */
+	ip_vs_conn_hash(cp);
+
+	return cp;
+}
+
+
+/*
+ *	/proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+
+	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+		ct_read_lock_bh(idx);
+		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+			if (pos-- == 0) {
+				seq->private = &ip_vs_conn_tab[idx];
+				return cp;
+			}
+		}
+		ct_read_unlock_bh(idx);
+	}
+
+	return NULL;
+}
+
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	seq->private = NULL;
+	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+}
+
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_conn *cp = v;
+	struct list_head *e, *l = seq->private;
+	int idx;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_conn_array(seq, 0);
+
+	/* more on same hash chain? */
+	if ((e = cp->c_list.next) != l)
+		return list_entry(e, struct ip_vs_conn, c_list);
+
+	idx = l - ip_vs_conn_tab;
+	ct_read_unlock_bh(idx);
+
+	while (++idx < IP_VS_CONN_TAB_SIZE) {
+		ct_read_lock_bh(idx);
+		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+			seq->private = &ip_vs_conn_tab[idx];
+			return cp;
+		}
+		ct_read_unlock_bh(idx);
+	}
+	seq->private = NULL;
+	return NULL;
+}
+
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+{
+	struct list_head *l = seq->private;
+
+	if (l)
+		ct_read_unlock_bh(l - ip_vs_conn_tab);
+}
+
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+	else {
+		const struct ip_vs_conn *cp = v;
+
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			seq_printf(seq,
+				"%-3s " NIP6_FMT " %04X " NIP6_FMT
+				" %04X " NIP6_FMT " %04X %-11s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				NIP6(cp->caddr.in6), ntohs(cp->cport),
+				NIP6(cp->vaddr.in6), ntohs(cp->vport),
+				NIP6(cp->daddr.in6), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				(cp->timer.expires-jiffies)/HZ);
+		else
+#endif
+			seq_printf(seq,
+				"%-3s %08X %04X %08X %04X"
+				" %08X %04X %-11s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				ntohl(cp->caddr.ip), ntohs(cp->cport),
+				ntohl(cp->vaddr.ip), ntohs(cp->vport),
+				ntohl(cp->daddr.ip), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				(cp->timer.expires-jiffies)/HZ);
+	}
+	return 0;
+}
+
+static const struct seq_operations ip_vs_conn_seq_ops = {
+	.start = ip_vs_conn_seq_start,
+	.next  = ip_vs_conn_seq_next,
+	.stop  = ip_vs_conn_seq_stop,
+	.show  = ip_vs_conn_seq_show,
+};
+
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ip_vs_conn_seq_ops);
+}
+
+static const struct file_operations ip_vs_conn_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_conn_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+static const char *ip_vs_origin_name(unsigned flags)
+{
+	if (flags & IP_VS_CONN_F_SYNC)
+		return "SYNC";
+	else
+		return "LOCAL";
+}
+
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
+	else {
+		const struct ip_vs_conn *cp = v;
+
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			seq_printf(seq,
+				"%-3s " NIP6_FMT " %04X " NIP6_FMT
+				" %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				NIP6(cp->caddr.in6), ntohs(cp->cport),
+				NIP6(cp->vaddr.in6), ntohs(cp->vport),
+				NIP6(cp->daddr.in6), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_origin_name(cp->flags),
+				(cp->timer.expires-jiffies)/HZ);
+		else
+#endif
+			seq_printf(seq,
+				"%-3s %08X %04X %08X %04X "
+				"%08X %04X %-11s %-6s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				ntohl(cp->caddr.ip), ntohs(cp->cport),
+				ntohl(cp->vaddr.ip), ntohs(cp->vport),
+				ntohl(cp->daddr.ip), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_origin_name(cp->flags),
+				(cp->timer.expires-jiffies)/HZ);
+	}
+	return 0;
+}
+
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+	.start = ip_vs_conn_seq_start,
+	.next  = ip_vs_conn_seq_next,
+	.stop  = ip_vs_conn_seq_stop,
+	.show  = ip_vs_conn_sync_seq_show,
+};
+
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ip_vs_conn_sync_seq_ops);
+}
+
+static const struct file_operations ip_vs_conn_sync_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_conn_sync_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+#endif
+
+
+/*
+ *      Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+	/*
+	 * The drop rate array needs tuning for real environments.
+	 * Called from timer bh only => no locking
+	 */
+	static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+	static char todrop_counter[9] = {0};
+	int i;
+
+	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+	   This will leave enough time for normal connection to get
+	   through. */
+	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+		return 0;
+
+	/* Don't drop the entry if its number of incoming packets is not
+	   located in [0, 8] */
+	i = atomic_read(&cp->in_pkts);
+	if (i > 8 || i < 0) return 0;
+
+	if (!todrop_rate[i]) return 0;
+	if (--todrop_counter[i] > 0) return 0;
+
+	todrop_counter[i] = todrop_rate[i];
+	return 1;
+}
+
+/* Called from keventd and must protect itself from softirqs */
+void ip_vs_random_dropentry(void)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+
+	/*
+	 * Randomly scan 1/32 of the whole table every second
+	 */
+	for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
+		unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock_bh(hash);
+
+		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+				/* connection template */
+				continue;
+
+			if (cp->protocol == IPPROTO_TCP) {
+				switch(cp->state) {
+				case IP_VS_TCP_S_SYN_RECV:
+				case IP_VS_TCP_S_SYNACK:
+					break;
+
+				case IP_VS_TCP_S_ESTABLISHED:
+					if (todrop_entry(cp))
+						break;
+					continue;
+
+				default:
+					continue;
+				}
+			} else {
+				if (!todrop_entry(cp))
+					continue;
+			}
+
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (cp->control) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(cp->control);
+			}
+		}
+		ct_write_unlock_bh(hash);
+	}
+}
+
+
+/*
+ *      Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(void)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+
+  flush_again:
+	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock_bh(idx);
+
+		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (cp->control) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(cp->control);
+			}
+		}
+		ct_write_unlock_bh(idx);
+	}
+
+	/* the counter may be not NULL, because maybe some conn entries
+	   are run by slow timer handler or unhashed but still referred */
+	if (atomic_read(&ip_vs_conn_count) != 0) {
+		schedule();
+		goto flush_again;
+	}
+}
+
+
+int __init ip_vs_conn_init(void)
+{
+	int idx;
+
+	/*
+	 * Allocate the connection hash table and initialize its list heads
+	 */
+	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+	if (!ip_vs_conn_tab)
+		return -ENOMEM;
+
+	/* Allocate ip_vs_conn slab cache */
+	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+					      sizeof(struct ip_vs_conn), 0,
+					      SLAB_HWCACHE_ALIGN, NULL);
+	if (!ip_vs_conn_cachep) {
+		vfree(ip_vs_conn_tab);
+		return -ENOMEM;
+	}
+
+	IP_VS_INFO("Connection hash table configured "
+		   "(size=%d, memory=%ldKbytes)\n",
+		   IP_VS_CONN_TAB_SIZE,
+		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+		  sizeof(struct ip_vs_conn));
+
+	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+	}
+
+	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
+		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+	}
+
+	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+	return 0;
+}
+
+
+void ip_vs_conn_cleanup(void)
+{
+	/* flush all the connection entries first */
+	ip_vs_conn_flush();
+
+	/* Release the empty cache */
+	kmem_cache_destroy(ip_vs_conn_cachep);
+	proc_net_remove(&init_net, "ip_vs_conn");
+	proc_net_remove(&init_net, "ip_vs_conn_sync");
+	vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
new file mode 100644
index 00000000000..958abf3e5f8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,1542 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ *	Paul `Rusty' Russell		properly handle non-linear skbs
+ *	Harald Welte			don't use nfcache
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#endif
+
+#include <net/ip_vs.h>
+
+
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_skb_replace);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph)          (((icmph)->un).echo.id)
+#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
+
+const char *ip_vs_proto_name(unsigned proto)
+{
+	static char buf[20];
+
+	switch (proto) {
+	case IPPROTO_IP:
+		return "IP";
+	case IPPROTO_UDP:
+		return "UDP";
+	case IPPROTO_TCP:
+		return "TCP";
+	case IPPROTO_ICMP:
+		return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+	case IPPROTO_ICMPV6:
+		return "ICMPv6";
+#endif
+	default:
+		sprintf(buf, "IP_%d", proto);
+		return buf;
+	}
+}
+
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+	while (--rows >= 0)
+		INIT_LIST_HEAD(&table[rows]);
+}
+
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		spin_lock(&dest->stats.lock);
+		dest->stats.ustats.inpkts++;
+		dest->stats.ustats.inbytes += skb->len;
+		spin_unlock(&dest->stats.lock);
+
+		spin_lock(&dest->svc->stats.lock);
+		dest->svc->stats.ustats.inpkts++;
+		dest->svc->stats.ustats.inbytes += skb->len;
+		spin_unlock(&dest->svc->stats.lock);
+
+		spin_lock(&ip_vs_stats.lock);
+		ip_vs_stats.ustats.inpkts++;
+		ip_vs_stats.ustats.inbytes += skb->len;
+		spin_unlock(&ip_vs_stats.lock);
+	}
+}
+
+
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		spin_lock(&dest->stats.lock);
+		dest->stats.ustats.outpkts++;
+		dest->stats.ustats.outbytes += skb->len;
+		spin_unlock(&dest->stats.lock);
+
+		spin_lock(&dest->svc->stats.lock);
+		dest->svc->stats.ustats.outpkts++;
+		dest->svc->stats.ustats.outbytes += skb->len;
+		spin_unlock(&dest->svc->stats.lock);
+
+		spin_lock(&ip_vs_stats.lock);
+		ip_vs_stats.ustats.outpkts++;
+		ip_vs_stats.ustats.outbytes += skb->len;
+		spin_unlock(&ip_vs_stats.lock);
+	}
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+	spin_lock(&cp->dest->stats.lock);
+	cp->dest->stats.ustats.conns++;
+	spin_unlock(&cp->dest->stats.lock);
+
+	spin_lock(&svc->stats.lock);
+	svc->stats.ustats.conns++;
+	spin_unlock(&svc->stats.lock);
+
+	spin_lock(&ip_vs_stats.lock);
+	ip_vs_stats.ustats.conns++;
+	spin_unlock(&ip_vs_stats.lock);
+}
+
+
+static inline int
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+		const struct sk_buff *skb,
+		struct ip_vs_protocol *pp)
+{
+	if (unlikely(!pp->state_transition))
+		return 0;
+	return pp->state_transition(cp, direction, skb, pp);
+}
+
+
+/*
+ *  IPVS persistent scheduling function
+ *  It creates a connection entry according to its template if exists,
+ *  or selects a server and creates a connection entry plus a template.
+ *  Locking: we are svc user (svc->refcnt), so we hold all dests too
+ *  Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+		    const struct sk_buff *skb,
+		    __be16 ports[2])
+{
+	struct ip_vs_conn *cp = NULL;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *ct;
+	__be16  dport;			/* destination port to forward */
+	union nf_inet_addr snet;	/* source network of the client,
+					   after masking */
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	/* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+	else
+#endif
+		snet.ip = iph.saddr.ip & svc->netmask;
+
+	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+		      "mnet %s\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+		      IP_VS_DBG_ADDR(svc->af, &snet));
+
+	/*
+	 * As far as we know, FTP is a very complicated network protocol, and
+	 * it uses control connection and data connections. For active FTP,
+	 * FTP server initialize data connection to the client, its source port
+	 * is often 20. For passive FTP, FTP server tells the clients the port
+	 * that it passively listens to,  and the client issues the data
+	 * connection. In the tunneling or direct routing mode, the load
+	 * balancer is on the client-to-server half of connection, the port
+	 * number is unknown to the load balancer. So, a conn template like
+	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+	 * is created for other persistent services.
+	 */
+	if (ports[1] == svc->port) {
+		/* Check if a template already exists */
+		if (svc->port != FTPPORT)
+			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
+					     &iph.daddr, ports[1]);
+		else
+			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
+					     &iph.daddr, 0);
+
+		if (!ct || !ip_vs_check_template(ct)) {
+			/*
+			 * No template found or the dest of the connection
+			 * template is not available.
+			 */
+			dest = svc->scheduler->schedule(svc, skb);
+			if (dest == NULL) {
+				IP_VS_DBG(1, "p-schedule: no dest found.\n");
+				return NULL;
+			}
+
+			/*
+			 * Create a template like <protocol,caddr,0,
+			 * vaddr,vport,daddr,dport> for non-ftp service,
+			 * and <protocol,caddr,0,vaddr,0,daddr,0>
+			 * for ftp service.
+			 */
+			if (svc->port != FTPPORT)
+				ct = ip_vs_conn_new(svc->af, iph.protocol,
+						    &snet, 0,
+						    &iph.daddr,
+						    ports[1],
+						    &dest->addr, dest->port,
+						    IP_VS_CONN_F_TEMPLATE,
+						    dest);
+			else
+				ct = ip_vs_conn_new(svc->af, iph.protocol,
+						    &snet, 0,
+						    &iph.daddr, 0,
+						    &dest->addr, 0,
+						    IP_VS_CONN_F_TEMPLATE,
+						    dest);
+			if (ct == NULL)
+				return NULL;
+
+			ct->timeout = svc->timeout;
+		} else {
+			/* set destination with the found template */
+			dest = ct->dest;
+		}
+		dport = dest->port;
+	} else {
+		/*
+		 * Note: persistent fwmark-based services and persistent
+		 * port zero service are handled here.
+		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
+		 */
+		if (svc->fwmark) {
+			union nf_inet_addr fwmark = {
+				.all = { 0, 0, 0, htonl(svc->fwmark) }
+			};
+
+			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
+					     &fwmark, 0);
+		} else
+			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
+					     &iph.daddr, 0);
+
+		if (!ct || !ip_vs_check_template(ct)) {
+			/*
+			 * If it is not persistent port zero, return NULL,
+			 * otherwise create a connection template.
+			 */
+			if (svc->port)
+				return NULL;
+
+			dest = svc->scheduler->schedule(svc, skb);
+			if (dest == NULL) {
+				IP_VS_DBG(1, "p-schedule: no dest found.\n");
+				return NULL;
+			}
+
+			/*
+			 * Create a template according to the service
+			 */
+			if (svc->fwmark) {
+				union nf_inet_addr fwmark = {
+					.all = { 0, 0, 0, htonl(svc->fwmark) }
+				};
+
+				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
+						    &snet, 0,
+						    &fwmark, 0,
+						    &dest->addr, 0,
+						    IP_VS_CONN_F_TEMPLATE,
+						    dest);
+			} else
+				ct = ip_vs_conn_new(svc->af, iph.protocol,
+						    &snet, 0,
+						    &iph.daddr, 0,
+						    &dest->addr, 0,
+						    IP_VS_CONN_F_TEMPLATE,
+						    dest);
+			if (ct == NULL)
+				return NULL;
+
+			ct->timeout = svc->timeout;
+		} else {
+			/* set destination with the found template */
+			dest = ct->dest;
+		}
+		dport = ports[1];
+	}
+
+	/*
+	 *    Create a new connection according to the template
+	 */
+	cp = ip_vs_conn_new(svc->af, iph.protocol,
+			    &iph.saddr, ports[0],
+			    &iph.daddr, ports[1],
+			    &dest->addr, dport,
+			    0,
+			    dest);
+	if (cp == NULL) {
+		ip_vs_conn_put(ct);
+		return NULL;
+	}
+
+	/*
+	 *    Add its control
+	 */
+	ip_vs_control_add(cp, ct);
+	ip_vs_conn_put(ct);
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  IPVS main scheduling function
+ *  It selects a server according to the virtual service, and
+ *  creates a connection entry.
+ *  Protocols supported: TCP, UDP
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_conn *cp = NULL;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest;
+	__be16 _ports[2], *pptr;
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	/*
+	 *    Persistent service
+	 */
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+		return ip_vs_sched_persist(svc, skb, pptr);
+
+	/*
+	 *    Non-persistent service
+	 */
+	if (!svc->fwmark && pptr[1] != svc->port) {
+		if (!svc->port)
+			IP_VS_ERR("Schedule: port zero only supported "
+				  "in persistent services, "
+				  "check your ipvs configuration\n");
+		return NULL;
+	}
+
+	dest = svc->scheduler->schedule(svc, skb);
+	if (dest == NULL) {
+		IP_VS_DBG(1, "Schedule: no dest found.\n");
+		return NULL;
+	}
+
+	/*
+	 *    Create a connection entry.
+	 */
+	cp = ip_vs_conn_new(svc->af, iph.protocol,
+			    &iph.saddr, pptr[0],
+			    &iph.daddr, pptr[1],
+			    &dest->addr, dest->port ? dest->port : pptr[1],
+			    0,
+			    dest);
+	if (cp == NULL)
+		return NULL;
+
+	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+		      ip_vs_fwd_tag(cp),
+		      IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
+		      cp->flags, atomic_read(&cp->refcnt));
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  Pass or drop the packet.
+ *  Called by ip_vs_in, when the virtual service is available but
+ *  no destination is available for a new connection.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+		struct ip_vs_protocol *pp)
+{
+	__be16 _ports[2], *pptr;
+	struct ip_vs_iphdr iph;
+	int unicast;
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		ip_vs_service_put(svc);
+		return NF_DROP;
+	}
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+	else
+#endif
+		unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+
+	/* if it is fwmark-based service, the cache_bypass sysctl is up
+	   and the destination is a non-local unicast, then create
+	   a cache_bypass connection entry */
+	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
+		int ret, cs;
+		struct ip_vs_conn *cp;
+		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
+
+		ip_vs_service_put(svc);
+
+		/* create a new connection entry */
+		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
+		cp = ip_vs_conn_new(svc->af, iph.protocol,
+				    &iph.saddr, pptr[0],
+				    &iph.daddr, pptr[1],
+				    &daddr, 0,
+				    IP_VS_CONN_F_BYPASS,
+				    NULL);
+		if (cp == NULL)
+			return NF_DROP;
+
+		/* statistics */
+		ip_vs_in_stats(cp, skb);
+
+		/* set state */
+		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+
+		/* transmit the first SYN packet */
+		ret = cp->packet_xmit(skb, cp, pp);
+		/* do not touch skb anymore */
+
+		atomic_inc(&cp->in_pkts);
+		ip_vs_conn_put(cp);
+		return ret;
+	}
+
+	/*
+	 * When the virtual ftp service is presented, packets destined
+	 * for other services on the VIP may get here (except services
+	 * listed in the ipvs table), pass the packets, because it is
+	 * not ipvs job to decide to drop the packets.
+	 */
+	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
+		ip_vs_service_put(svc);
+		return NF_ACCEPT;
+	}
+
+	ip_vs_service_put(svc);
+
+	/*
+	 * Notify the client that the destination is unreachable, and
+	 * release the socket buffer.
+	 * Since it is in IP layer, the TCP socket is not actually
+	 * created, the TCP RST packet cannot be sent, instead that
+	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+	 */
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
+			    skb->dev);
+	else
+#endif
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	return NF_DROP;
+}
+
+
+/*
+ *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
+ *      chain, and is used for VS/NAT.
+ *      It detects packets for VS/NAT connections and sends the packets
+ *      immediately. This can avoid that iptable_nat mangles the packets
+ *      for VS/NAT.
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	if (!skb->ipvs_property)
+		return NF_ACCEPT;
+	/* The packet was sent from IPVS, exit this chain */
+	return NF_STOP;
+}
+
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+}
+
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	int err = ip_defrag(skb, user);
+
+	if (!err)
+		ip_send_check(ip_hdr(skb));
+
+	return err;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
+{
+	/* TODO IPv6: Find out what to do here for IPv6 */
+	return 0;
+}
+#endif
+
+/*
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    struct ip_vs_conn *cp, int inout)
+{
+	struct iphdr *iph	 = ip_hdr(skb);
+	unsigned int icmp_offset = iph->ihl*4;
+	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
+						      icmp_offset);
+	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
+
+	if (inout) {
+		iph->saddr = cp->vaddr.ip;
+		ip_send_check(iph);
+		ciph->daddr = cp->vaddr.ip;
+		ip_send_check(ciph);
+	} else {
+		iph->daddr = cp->daddr.ip;
+		ip_send_check(iph);
+		ciph->saddr = cp->daddr.ip;
+		ip_send_check(ciph);
+	}
+
+	/* the TCP/UDP port */
+	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
+		__be16 *ports = (void *)ciph + ciph->ihl*4;
+
+		if (inout)
+			ports[1] = cp->vport;
+		else
+			ports[0] = cp->dport;
+	}
+
+	/* And finally the ICMP checksum */
+	icmph->checksum = 0;
+	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (inout)
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered outgoing ICMP");
+	else
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered incoming ICMP");
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    struct ip_vs_conn *cp, int inout)
+{
+	struct ipv6hdr *iph	 = ipv6_hdr(skb);
+	unsigned int icmp_offset = sizeof(struct ipv6hdr);
+	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) +
+						      icmp_offset);
+	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1);
+
+	if (inout) {
+		iph->saddr = cp->vaddr.in6;
+		ciph->daddr = cp->vaddr.in6;
+	} else {
+		iph->daddr = cp->daddr.in6;
+		ciph->saddr = cp->daddr.in6;
+	}
+
+	/* the TCP/UDP port */
+	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
+		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
+
+		if (inout)
+			ports[1] = cp->vport;
+		else
+			ports[0] = cp->dport;
+	}
+
+	/* And finally the ICMP checksum */
+	icmph->icmp6_cksum = 0;
+	/* TODO IPv6: is this correct for ICMPv6? */
+	ip_vs_checksum_complete(skb, icmp_offset);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (inout)
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered outgoing ICMPv6");
+	else
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered incoming ICMPv6");
+}
+#endif
+
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host. Used for NAT and local client.
+ */
+static int handle_response_icmp(int af, struct sk_buff *skb,
+				union nf_inet_addr *snet,
+				__u8 protocol, struct ip_vs_conn *cp,
+				struct ip_vs_protocol *pp,
+				unsigned int offset, unsigned int ihl)
+{
+	unsigned int verdict = NF_DROP;
+
+	if (IP_VS_FWD_METHOD(cp) != 0) {
+		IP_VS_ERR("shouldn't reach here, because the box is on the "
+			  "half connection in the tun/dr module.\n");
+	}
+
+	/* Ensure the checksum is correct */
+	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+			      IP_VS_DBG_ADDR(af, snet));
+		goto out;
+	}
+
+	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
+		offset += 2 * sizeof(__u16);
+	if (!skb_make_writable(skb, offset))
+		goto out;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+	else
+#endif
+		ip_vs_nat_icmp(skb, pp, cp, 1);
+
+	/* do the statistics and put it back */
+	ip_vs_out_stats(cp, skb);
+
+	skb->ipvs_property = 1;
+	verdict = NF_ACCEPT;
+
+out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
+/*
+ *	Handle ICMP messages in the inside-to-outside direction (outgoing).
+ *	Find any that might be relevant, check against existing connections.
+ *	Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+{
+	struct iphdr *iph;
+	struct icmphdr	_icmph, *ic;
+	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, ihl;
+	union nf_inet_addr snet;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+			return NF_STOLEN;
+	}
+
+	iph = ip_hdr(skb);
+	offset = ihl = iph->ihl * 4;
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+		  ic->type, ntohs(icmp_id(ic)),
+		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->type != ICMP_DEST_UNREACH) &&
+	    (ic->type != ICMP_SOURCE_QUENCH) &&
+	    (ic->type != ICMP_TIME_EXCEEDED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->protocol);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+		     pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+
+	offset += cih->ihl * 4;
+
+	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	snet.ip = iph->saddr;
+	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+				    pp, offset, ihl);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+{
+	struct ipv6hdr *iph;
+	struct icmp6hdr	_icmph, *ic;
+	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
+					   within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset;
+	union nf_inet_addr snet;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+			return NF_STOLEN;
+	}
+
+	iph = ipv6_hdr(skb);
+	offset = sizeof(struct ipv6hdr);
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  NIP6(iph->saddr), NIP6(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->nexthdr);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	ipv6_addr_copy(&snet.in6, &iph->saddr);
+	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
+				    pp, offset, sizeof(struct ipv6hdr));
+}
+#endif
+
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+	struct tcphdr _tcph, *th;
+
+	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+	return th->rst;
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ * Used for NAT and local client.
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+		struct ip_vs_conn *cp, int ihl)
+{
+	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+
+	if (!skb_make_writable(skb, ihl))
+		goto drop;
+
+	/* mangle the packet */
+	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+		goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+	else
+#endif
+	{
+		ip_hdr(skb)->saddr = cp->vaddr.ip;
+		ip_send_check(ip_hdr(skb));
+	}
+
+	/* For policy routing, packets originating from this
+	 * machine itself may be routed differently to packets
+	 * passing through.  We want this packet to be routed as
+	 * if it came from this machine itself.  So re-compute
+	 * the routing information.
+	 */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (ip6_route_me_harder(skb) != 0)
+			goto drop;
+	} else
+#endif
+		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto drop;
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+
+	ip_vs_out_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+	ip_vs_conn_put(cp);
+
+	skb->ipvs_property = 1;
+
+	LeaveFunction(11);
+	return NF_ACCEPT;
+
+drop:
+	ip_vs_conn_put(cp);
+	kfree_skb(skb);
+	return NF_STOLEN;
+}
+
+/*
+ *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *	Check if outgoing packet belongs to the established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
+	  const struct net_device *in, const struct net_device *out,
+	  int (*okfn)(struct sk_buff *))
+{
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	int af;
+
+	EnterFunction(11);
+
+	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+
+	if (skb->ipvs_property)
+		return NF_ACCEPT;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+			int related, verdict = ip_vs_out_icmp(skb, &related);
+
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+
+	pp = ip_vs_proto_get(iph.protocol);
+	if (unlikely(!pp))
+		return NF_ACCEPT;
+
+	/* reassemble IP fragments */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+
+			if (related)
+				return verdict;
+
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
+			     !pp->dont_defrag)) {
+			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+				return NF_STOLEN;
+
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+
+	if (unlikely(!cp)) {
+		if (sysctl_ip_vs_nat_icmp_send &&
+		    (pp->protocol == IPPROTO_TCP ||
+		     pp->protocol == IPPROTO_UDP)) {
+			__be16 _ports[2], *pptr;
+
+			pptr = skb_header_pointer(skb, iph.len,
+						  sizeof(_ports), _ports);
+			if (pptr == NULL)
+				return NF_ACCEPT;	/* Not for me */
+			if (ip_vs_lookup_real_service(af, iph.protocol,
+						      &iph.saddr,
+						      pptr[0])) {
+				/*
+				 * Notify the real server: there is no
+				 * existing entry if it is not RST
+				 * packet or not TCP packet.
+				 */
+				if (iph.protocol != IPPROTO_TCP
+				    || !is_tcp_reset(skb, iph.len)) {
+#ifdef CONFIG_IP_VS_IPV6
+					if (af == AF_INET6)
+						icmpv6_send(skb,
+							    ICMPV6_DEST_UNREACH,
+							    ICMPV6_PORT_UNREACH,
+							    0, skb->dev);
+					else
+#endif
+						icmp_send(skb,
+							  ICMP_DEST_UNREACH,
+							  ICMP_PORT_UNREACH, 0);
+					return NF_DROP;
+				}
+			}
+		}
+		IP_VS_DBG_PKT(12, pp, skb, 0,
+			      "packet continues traversal as normal");
+		return NF_ACCEPT;
+	}
+
+	return handle_response(af, skb, pp, cp, iph.len);
+}
+
+
+/*
+ *	Handle ICMP messages in the outside-to-inside direction (incoming).
+ *	Find any that might be relevant, check against existing connections,
+ *	forward to the right destination host if relevant.
+ *	Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+	struct iphdr *iph;
+	struct icmphdr	_icmph, *ic;
+	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, ihl, verdict;
+	union nf_inet_addr snet;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+		if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
+					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+			return NF_STOLEN;
+	}
+
+	iph = ip_hdr(skb);
+	offset = ihl = iph->ihl * 4;
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+		  ic->type, ntohs(icmp_id(ic)),
+		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->type != ICMP_DEST_UNREACH) &&
+	    (ic->type != ICMP_SOURCE_QUENCH) &&
+	    (ic->type != ICMP_TIME_EXCEEDED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->protocol);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+		     pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+
+	offset += cih->ihl * 4;
+
+	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
+	if (!cp) {
+		/* The packet could also belong to a local client */
+		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+		if (cp) {
+			snet.ip = iph->saddr;
+			return handle_response_icmp(AF_INET, skb, &snet,
+						    cih->protocol, cp, pp,
+						    offset, ihl);
+		}
+		return NF_ACCEPT;
+	}
+
+	verdict = NF_DROP;
+
+	/* Ensure the checksum is correct */
+	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
+			  NIPQUAD(iph->saddr));
+		goto out;
+	}
+
+	/* do the statistics and put it back */
+	ip_vs_in_stats(cp, skb);
+	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+		offset += 2 * sizeof(__u16);
+	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
+	/* do not touch skb anymore */
+
+  out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int
+ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+	struct ipv6hdr *iph;
+	struct icmp6hdr	_icmph, *ic;
+	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
+					   within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, verdict;
+	union nf_inet_addr snet;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
+					       IP_DEFRAG_VS_IN :
+					       IP_DEFRAG_VS_FWD))
+			return NF_STOLEN;
+	}
+
+	iph = ipv6_hdr(skb);
+	offset = sizeof(struct ipv6hdr);
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  NIP6(iph->saddr), NIP6(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->nexthdr);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
+	if (!cp) {
+		/* The packet could also belong to a local client */
+		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+		if (cp) {
+			ipv6_addr_copy(&snet.in6, &iph->saddr);
+			return handle_response_icmp(AF_INET6, skb, &snet,
+						    cih->nexthdr,
+						    cp, pp, offset,
+						    sizeof(struct ipv6hdr));
+		}
+		return NF_ACCEPT;
+	}
+
+	verdict = NF_DROP;
+
+	/* do the statistics and put it back */
+	ip_vs_in_stats(cp, skb);
+	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
+		offset += 2 * sizeof(__u16);
+	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
+	/* do not touch skb anymore */
+
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+#endif
+
+
+/*
+ *	Check if it's for virtual services, look it up,
+ *	and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
+	 const struct net_device *in, const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	int ret, restart, af;
+
+	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	/*
+	 *	Big tappo: only PACKET_HOST, including loopback for local client
+	 *	Don't handle local packets on IPv6 for now
+	 */
+	if (unlikely(skb->pkt_type != PACKET_HOST)) {
+		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
+			      skb->pkt_type,
+			      iph.protocol,
+			      IP_VS_DBG_ADDR(af, &iph.daddr));
+		return NF_ACCEPT;
+	}
+
+	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+		int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+		if (related)
+			return verdict;
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+	}
+
+	/* Protocol supported? */
+	pp = ip_vs_proto_get(iph.protocol);
+	if (unlikely(!pp))
+		return NF_ACCEPT;
+
+	/*
+	 * Check if the packet belongs to an existing connection entry
+	 */
+	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+
+	if (unlikely(!cp)) {
+		int v;
+
+		/* For local client packets, it could be a response */
+		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+		if (cp)
+			return handle_response(af, skb, pp, cp, iph.len);
+
+		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+			return v;
+	}
+
+	if (unlikely(!cp)) {
+		/* sorry, all this trouble for a no-hit :) */
+		IP_VS_DBG_PKT(12, pp, skb, 0,
+			      "packet continues traversal as normal");
+		return NF_ACCEPT;
+	}
+
+	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+
+	/* Check the server status */
+	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		/* the destination server is not available */
+
+		if (sysctl_ip_vs_expire_nodest_conn) {
+			/* try to expire the connection immediately */
+			ip_vs_conn_expire_now(cp);
+		}
+		/* don't restart its timer, and silently
+		   drop the packet. */
+		__ip_vs_conn_put(cp);
+		return NF_DROP;
+	}
+
+	ip_vs_in_stats(cp, skb);
+	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+	if (cp->packet_xmit)
+		ret = cp->packet_xmit(skb, cp, pp);
+		/* do not touch skb anymore */
+	else {
+		IP_VS_DBG_RL("warning: packet_xmit is null");
+		ret = NF_ACCEPT;
+	}
+
+	/* Increase its packet counter and check if it is needed
+	 * to be synchronized
+	 *
+	 * Sync connection if it is about to close to
+	 * encorage the standby servers to update the connections timeout
+	 */
+	atomic_inc(&cp->in_pkts);
+	if (af == AF_INET &&
+	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	    (((cp->protocol != IPPROTO_TCP ||
+	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+	      (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+	       == sysctl_ip_vs_sync_threshold[0])) ||
+	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
+	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
+	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
+	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
+		ip_vs_sync_conn(cp);
+	cp->old_state = cp->state;
+
+	ip_vs_conn_put(cp);
+	return ret;
+}
+
+
+/*
+ *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
+ *      related packets destined for 0.0.0.0/0.
+ *      When fwmark-based virtual service is used, such as transparent
+ *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
+ *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
+ *      and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	int r;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp(skb, &r, hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static unsigned int
+ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in, const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	int r;
+
+	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp_v6(skb, &r, hooknum);
+}
+#endif
+
+
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_in,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum        = NF_INET_LOCAL_IN,
+		.priority       = 100,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_out,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum        = NF_INET_FORWARD,
+		.priority       = 100,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum        = NF_INET_FORWARD,
+		.priority       = 99,
+	},
+	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	{
+		.hook		= ip_vs_post_routing,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum        = NF_INET_POST_ROUTING,
+		.priority       = NF_IP_PRI_NAT_SRC-1,
+	},
+#ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_in,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_LOCAL_IN,
+		.priority       = 100,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_out,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_FORWARD,
+		.priority       = 100,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp_v6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_FORWARD,
+		.priority       = 99,
+	},
+	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	{
+		.hook		= ip_vs_post_routing,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_POST_ROUTING,
+		.priority       = NF_IP6_PRI_NAT_SRC-1,
+	},
+#endif
+};
+
+
+/*
+ *	Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+	int ret;
+
+	ip_vs_estimator_init();
+
+	ret = ip_vs_control_init();
+	if (ret < 0) {
+		IP_VS_ERR("can't setup control.\n");
+		goto cleanup_estimator;
+	}
+
+	ip_vs_protocol_init();
+
+	ret = ip_vs_app_init();
+	if (ret < 0) {
+		IP_VS_ERR("can't setup application helper.\n");
+		goto cleanup_protocol;
+	}
+
+	ret = ip_vs_conn_init();
+	if (ret < 0) {
+		IP_VS_ERR("can't setup connection table.\n");
+		goto cleanup_app;
+	}
+
+	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	if (ret < 0) {
+		IP_VS_ERR("can't register hooks.\n");
+		goto cleanup_conn;
+	}
+
+	IP_VS_INFO("ipvs loaded.\n");
+	return ret;
+
+  cleanup_conn:
+	ip_vs_conn_cleanup();
+  cleanup_app:
+	ip_vs_app_cleanup();
+  cleanup_protocol:
+	ip_vs_protocol_cleanup();
+	ip_vs_control_cleanup();
+  cleanup_estimator:
+	ip_vs_estimator_cleanup();
+	return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	ip_vs_conn_cleanup();
+	ip_vs_app_cleanup();
+	ip_vs_protocol_cleanup();
+	ip_vs_control_cleanup();
+	ip_vs_estimator_cleanup();
+	IP_VS_INFO("ipvs unloaded.\n");
+}
+
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
new file mode 100644
index 00000000000..0302cf3e503
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -0,0 +1,3443 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DEFINE_MUTEX(__ip_vs_mutex);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_svc_lock);
+
+/* lock for table with the real services */
+static DEFINE_RWLOCK(__ip_vs_rs_lock);
+
+/* lock for state and timeout tables */
+static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+
+/* lock for drop entry handling */
+static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
+
+/* lock for drop packet handling */
+static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
+
+/* 1/rate drop and drop-entry variables */
+int ip_vs_drop_rate = 0;
+int ip_vs_drop_counter = 0;
+static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
+
+/* number of virtual services */
+static int ip_vs_num_services = 0;
+
+/* sysctl variables */
+static int sysctl_ip_vs_drop_entry = 0;
+static int sysctl_ip_vs_drop_packet = 0;
+static int sysctl_ip_vs_secure_tcp = 0;
+static int sysctl_ip_vs_amemthresh = 1024;
+static int sysctl_ip_vs_am_droprate = 10;
+int sysctl_ip_vs_cache_bypass = 0;
+int sysctl_ip_vs_expire_nodest_conn = 0;
+int sysctl_ip_vs_expire_quiescent_template = 0;
+int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
+int sysctl_ip_vs_nat_icmp_send = 0;
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+static int sysctl_ip_vs_debug_level = 0;
+
+int ip_vs_get_debug_level(void)
+{
+	return sysctl_ip_vs_debug_level;
+}
+#endif
+
+#ifdef CONFIG_IP_VS_IPV6
+/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
+static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+{
+	struct rt6_info *rt;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *addr,
+				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
+	};
+
+	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
+			return 1;
+
+	return 0;
+}
+#endif
+/*
+ *	update_defense_level is called from keventd and from sysctl,
+ *	so it needs to protect itself from softirqs
+ */
+static void update_defense_level(void)
+{
+	struct sysinfo i;
+	static int old_secure_tcp = 0;
+	int availmem;
+	int nomem;
+	int to_change = -1;
+
+	/* we only count free and buffered memory (in pages) */
+	si_meminfo(&i);
+	availmem = i.freeram + i.bufferram;
+	/* however in linux 2.5 the i.bufferram is total page cache size,
+	   we need adjust it */
+	/* si_swapinfo(&i); */
+	/* availmem = availmem - (i.totalswap - i.freeswap); */
+
+	nomem = (availmem < sysctl_ip_vs_amemthresh);
+
+	local_bh_disable();
+
+	/* drop_entry */
+	spin_lock(&__ip_vs_dropentry_lock);
+	switch (sysctl_ip_vs_drop_entry) {
+	case 0:
+		atomic_set(&ip_vs_dropentry, 0);
+		break;
+	case 1:
+		if (nomem) {
+			atomic_set(&ip_vs_dropentry, 1);
+			sysctl_ip_vs_drop_entry = 2;
+		} else {
+			atomic_set(&ip_vs_dropentry, 0);
+		}
+		break;
+	case 2:
+		if (nomem) {
+			atomic_set(&ip_vs_dropentry, 1);
+		} else {
+			atomic_set(&ip_vs_dropentry, 0);
+			sysctl_ip_vs_drop_entry = 1;
+		};
+		break;
+	case 3:
+		atomic_set(&ip_vs_dropentry, 1);
+		break;
+	}
+	spin_unlock(&__ip_vs_dropentry_lock);
+
+	/* drop_packet */
+	spin_lock(&__ip_vs_droppacket_lock);
+	switch (sysctl_ip_vs_drop_packet) {
+	case 0:
+		ip_vs_drop_rate = 0;
+		break;
+	case 1:
+		if (nomem) {
+			ip_vs_drop_rate = ip_vs_drop_counter
+				= sysctl_ip_vs_amemthresh /
+				(sysctl_ip_vs_amemthresh-availmem);
+			sysctl_ip_vs_drop_packet = 2;
+		} else {
+			ip_vs_drop_rate = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			ip_vs_drop_rate = ip_vs_drop_counter
+				= sysctl_ip_vs_amemthresh /
+				(sysctl_ip_vs_amemthresh-availmem);
+		} else {
+			ip_vs_drop_rate = 0;
+			sysctl_ip_vs_drop_packet = 1;
+		}
+		break;
+	case 3:
+		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+		break;
+	}
+	spin_unlock(&__ip_vs_droppacket_lock);
+
+	/* secure_tcp */
+	write_lock(&__ip_vs_securetcp_lock);
+	switch (sysctl_ip_vs_secure_tcp) {
+	case 0:
+		if (old_secure_tcp >= 2)
+			to_change = 0;
+		break;
+	case 1:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+			sysctl_ip_vs_secure_tcp = 2;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+			sysctl_ip_vs_secure_tcp = 1;
+		}
+		break;
+	case 3:
+		if (old_secure_tcp < 2)
+			to_change = 1;
+		break;
+	}
+	old_secure_tcp = sysctl_ip_vs_secure_tcp;
+	if (to_change >= 0)
+		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
+	write_unlock(&__ip_vs_securetcp_lock);
+
+	local_bh_enable();
+}
+
+
+/*
+ *	Timer for checking the defense
+ */
+#define DEFENSE_TIMER_PERIOD	1*HZ
+static void defense_work_handler(struct work_struct *work);
+static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
+
+static void defense_work_handler(struct work_struct *work)
+{
+	update_defense_level();
+	if (atomic_read(&ip_vs_dropentry))
+		ip_vs_random_dropentry();
+
+	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+}
+
+int
+ip_vs_use_count_inc(void)
+{
+	return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+	module_put(THIS_MODULE);
+}
+
+
+/*
+ *	Hash table: for virtual service lookups
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+/*
+ *	Hash table: for real service lookups
+ */
+#define IP_VS_RTAB_BITS 4
+#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
+#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
+
+static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
+
+/*
+ *	Trash for destinations
+ */
+static LIST_HEAD(ip_vs_dest_trash);
+
+/*
+ *	FTP & NULL virtual service counters
+ */
+static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
+static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
+
+
+/*
+ *	Returns hash value for virtual service
+ */
+static __inline__ unsigned
+ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
+		  __be16 port)
+{
+	register unsigned porth = ntohs(port);
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+
+	return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
+		& IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Returns hash value of fwmark for virtual service lookup
+ */
+static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+{
+	return fwmark & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ *	or in the ip_vs_svc_fwm_table by fwmark.
+ *	Should be called with locked tables.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+	unsigned hash;
+
+	if (svc->flags & IP_VS_SVC_F_HASHED) {
+		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	if (svc->fwmark == 0) {
+		/*
+		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
+		 */
+		hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
+					 svc->port);
+		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+	} else {
+		/*
+		 *  Hash it by fwmark in ip_vs_svc_fwm_table
+		 */
+		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+	}
+
+	svc->flags |= IP_VS_SVC_F_HASHED;
+	/* increase its refcnt because it is referenced by the svc table */
+	atomic_inc(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ *	Should be called with locked tables.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	if (svc->fwmark == 0) {
+		/* Remove it from the ip_vs_svc_table table */
+		list_del(&svc->s_list);
+	} else {
+		/* Remove it from the ip_vs_svc_fwm_table table */
+		list_del(&svc->f_list);
+	}
+
+	svc->flags &= ~IP_VS_SVC_F_HASHED;
+	atomic_dec(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Get service by {proto,addr,port} in the service table.
+ */
+static inline struct ip_vs_service *
+__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+		    __be16 vport)
+{
+	unsigned hash;
+	struct ip_vs_service *svc;
+
+	/* Check for "full" addressed entries */
+	hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+
+	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+		if ((svc->af == af)
+		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
+		    && (svc->port == vport)
+		    && (svc->protocol == protocol)) {
+			/* HIT */
+			atomic_inc(&svc->usecnt);
+			return svc;
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Get service by {fwmark} in the service table.
+ */
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+{
+	unsigned hash;
+	struct ip_vs_service *svc;
+
+	/* Check for fwmark addressed entries */
+	hash = ip_vs_svc_fwm_hashkey(fwmark);
+
+	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+		if (svc->fwmark == fwmark && svc->af == af) {
+			/* HIT */
+			atomic_inc(&svc->usecnt);
+			return svc;
+		}
+	}
+
+	return NULL;
+}
+
+struct ip_vs_service *
+ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+		  const union nf_inet_addr *vaddr, __be16 vport)
+{
+	struct ip_vs_service *svc;
+
+	read_lock(&__ip_vs_svc_lock);
+
+	/*
+	 *	Check the table hashed by fwmark first
+	 */
+	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+		goto out;
+
+	/*
+	 *	Check the table hashed by <protocol,addr,port>
+	 *	for "full" addressed entries
+	 */
+	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+
+	if (svc == NULL
+	    && protocol == IPPROTO_TCP
+	    && atomic_read(&ip_vs_ftpsvc_counter)
+	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+		/*
+		 * Check if ftp service entry exists, the packet
+		 * might belong to FTP data connections.
+		 */
+		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+	}
+
+	if (svc == NULL
+	    && atomic_read(&ip_vs_nullsvc_counter)) {
+		/*
+		 * Check if the catch-all port (port zero) exists
+		 */
+		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+	}
+
+  out:
+	read_unlock(&__ip_vs_svc_lock);
+
+	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+		      fwmark, ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+		      svc ? "hit" : "not hit");
+
+	return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	atomic_inc(&svc->refcnt);
+	dest->svc = svc;
+}
+
+static inline void
+__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+{
+	struct ip_vs_service *svc = dest->svc;
+
+	dest->svc = NULL;
+	if (atomic_dec_and_test(&svc->refcnt))
+		kfree(svc);
+}
+
+
+/*
+ *	Returns hash value for real service
+ */
+static inline unsigned ip_vs_rs_hashkey(int af,
+					    const union nf_inet_addr *addr,
+					    __be16 port)
+{
+	register unsigned porth = ntohs(port);
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+
+	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
+		& IP_VS_RTAB_MASK;
+}
+
+/*
+ *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ *	should be called with locked tables.
+ */
+static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+{
+	unsigned hash;
+
+	if (!list_empty(&dest->d_list)) {
+		return 0;
+	}
+
+	/*
+	 *	Hash by proto,addr,port,
+	 *	which are the parameters of the real service.
+	 */
+	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+
+	list_add(&dest->d_list, &ip_vs_rtable[hash]);
+
+	return 1;
+}
+
+/*
+ *	UNhashes ip_vs_dest from ip_vs_rtable.
+ *	should be called with locked tables.
+ */
+static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+	/*
+	 * Remove it from the ip_vs_rtable table.
+	 */
+	if (!list_empty(&dest->d_list)) {
+		list_del(&dest->d_list);
+		INIT_LIST_HEAD(&dest->d_list);
+	}
+
+	return 1;
+}
+
+/*
+ *	Lookup real service by <proto,addr,port> in the real service table.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(int af, __u16 protocol,
+			  const union nf_inet_addr *daddr,
+			  __be16 dport)
+{
+	unsigned hash;
+	struct ip_vs_dest *dest;
+
+	/*
+	 *	Check for "full" addressed entries
+	 *	Return the first found entry
+	 */
+	hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+	read_lock(&__ip_vs_rs_lock);
+	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+		if ((dest->af == af)
+		    && ip_vs_addr_equal(af, &dest->addr, daddr)
+		    && (dest->port == dport)
+		    && ((dest->protocol == protocol) ||
+			dest->vfwmark)) {
+			/* HIT */
+			read_unlock(&__ip_vs_rs_lock);
+			return dest;
+		}
+	}
+	read_unlock(&__ip_vs_rs_lock);
+
+	return NULL;
+}
+
+/*
+ *	Lookup destination by {addr,port} in the given service
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		  __be16 dport)
+{
+	struct ip_vs_dest *dest;
+
+	/*
+	 * Find the destination for the given service
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if ((dest->af == svc->af)
+		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
+		    && (dest->port == dport)) {
+			/* HIT */
+			return dest;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Cretaed to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ *
+ * ip_vs_lookup_real_service() looked promissing, but
+ * seems not working as expected.
+ */
+struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+				   __be16 dport,
+				   const union nf_inet_addr *vaddr,
+				   __be16 vport, __u16 protocol)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_service *svc;
+
+	svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+	if (!svc)
+		return NULL;
+	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	if (dest)
+		atomic_inc(&dest->refcnt);
+	ip_vs_service_put(svc);
+	return dest;
+}
+
+/*
+ *  Lookup dest by {svc,addr,port} in the destination trash.
+ *  The destination trash is used to hold the destinations that are removed
+ *  from the service table but are still referenced by some conn entries.
+ *  The reason to add the destination trash is when the dest is temporary
+ *  down (either by administrator or by monitor program), the dest can be
+ *  picked back from the trash, the remaining connections to the dest can
+ *  continue, and the counting information of the dest is also useful for
+ *  scheduling.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		     __be16 dport)
+{
+	struct ip_vs_dest *dest, *nxt;
+
+	/*
+	 * Find the destination in trash
+	 */
+	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+			      "dest->refcnt=%d\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		if (dest->af == svc->af &&
+		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
+		    dest->port == dport &&
+		    dest->vfwmark == svc->fwmark &&
+		    dest->protocol == svc->protocol &&
+		    (svc->fwmark ||
+		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
+		      dest->vport == svc->port))) {
+			/* HIT */
+			return dest;
+		}
+
+		/*
+		 * Try to purge the destination from trash if not referenced
+		 */
+		if (atomic_read(&dest->refcnt) == 1) {
+			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
+				      "from trash\n",
+				      dest->vfwmark,
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port));
+			list_del(&dest->n_list);
+			ip_vs_dst_reset(dest);
+			__ip_vs_unbind_svc(dest);
+			kfree(dest);
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *  Clean up all the destinations in the trash
+ *  Called by the ip_vs_control_cleanup()
+ *
+ *  When the ip_vs_control_clearup is activated by ipvs module exit,
+ *  the service tables must have been flushed and all the connections
+ *  are expired, and the refcnt of each destination in the trash must
+ *  be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(void)
+{
+	struct ip_vs_dest *dest, *nxt;
+
+	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+		list_del(&dest->n_list);
+		ip_vs_dst_reset(dest);
+		__ip_vs_unbind_svc(dest);
+		kfree(dest);
+	}
+}
+
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+	spin_lock_bh(&stats->lock);
+
+	memset(&stats->ustats, 0, sizeof(stats->ustats));
+	ip_vs_zero_estimator(stats);
+
+	spin_unlock_bh(&stats->lock);
+}
+
+/*
+ *	Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc,
+		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+{
+	int conn_flags;
+
+	/* set the weight and the flags */
+	atomic_set(&dest->weight, udest->weight);
+	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+
+	/* check if local node and update the flags */
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6) {
+		if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
+			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+				| IP_VS_CONN_F_LOCALNODE;
+		}
+	} else
+#endif
+		if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
+			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+				| IP_VS_CONN_F_LOCALNODE;
+		}
+
+	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+	} else {
+		/*
+		 *    Put the real service in ip_vs_rtable if not present.
+		 *    For now only for NAT!
+		 */
+		write_lock_bh(&__ip_vs_rs_lock);
+		ip_vs_rs_hash(dest);
+		write_unlock_bh(&__ip_vs_rs_lock);
+	}
+	atomic_set(&dest->conn_flags, conn_flags);
+
+	/* bind the service */
+	if (!dest->svc) {
+		__ip_vs_bind_svc(dest, svc);
+	} else {
+		if (dest->svc != svc) {
+			__ip_vs_unbind_svc(dest);
+			ip_vs_zero_stats(&dest->stats);
+			__ip_vs_bind_svc(dest, svc);
+		}
+	}
+
+	/* set the dest status flags */
+	dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	dest->u_threshold = udest->u_threshold;
+	dest->l_threshold = udest->l_threshold;
+}
+
+
+/*
+ *	Create a destination for the given service
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
+	       struct ip_vs_dest **dest_p)
+{
+	struct ip_vs_dest *dest;
+	unsigned atype;
+
+	EnterFunction(2);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6) {
+		atype = ipv6_addr_type(&udest->addr.in6);
+		if ((!(atype & IPV6_ADDR_UNICAST) ||
+			atype & IPV6_ADDR_LINKLOCAL) &&
+			!__ip_vs_addr_is_local_v6(&udest->addr.in6))
+			return -EINVAL;
+	} else
+#endif
+	{
+		atype = inet_addr_type(&init_net, udest->addr.ip);
+		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+			return -EINVAL;
+	}
+
+	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+	if (dest == NULL) {
+		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
+		return -ENOMEM;
+	}
+
+	dest->af = svc->af;
+	dest->protocol = svc->protocol;
+	dest->vaddr = svc->addr;
+	dest->vport = svc->port;
+	dest->vfwmark = svc->fwmark;
+	ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
+	dest->port = udest->port;
+
+	atomic_set(&dest->activeconns, 0);
+	atomic_set(&dest->inactconns, 0);
+	atomic_set(&dest->persistconns, 0);
+	atomic_set(&dest->refcnt, 0);
+
+	INIT_LIST_HEAD(&dest->d_list);
+	spin_lock_init(&dest->dst_lock);
+	spin_lock_init(&dest->stats.lock);
+	__ip_vs_update_dest(svc, dest, udest);
+	ip_vs_new_estimator(&dest->stats);
+
+	*dest_p = dest;
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+/*
+ *	Add a destination into an existing service
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *dest;
+	union nf_inet_addr daddr;
+	__be16 dport = udest->port;
+	int ret;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
+			  "upper threshold\n");
+		return -ERANGE;
+	}
+
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+	/*
+	 * Check if the dest already exists in the list
+	 */
+	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+
+	if (dest != NULL) {
+		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
+		return -EEXIST;
+	}
+
+	/*
+	 * Check if the dest already exists in the trash and
+	 * is from the same service
+	 */
+	dest = ip_vs_trash_get_dest(svc, &daddr, dport);
+
+	if (dest != NULL) {
+		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+			      "dest->refcnt=%d, service %u/%s:%u\n",
+			      IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
+			      atomic_read(&dest->refcnt),
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+			      ntohs(dest->vport));
+
+		__ip_vs_update_dest(svc, dest, udest);
+
+		/*
+		 * Get the destination from the trash
+		 */
+		list_del(&dest->n_list);
+
+		ip_vs_new_estimator(&dest->stats);
+
+		write_lock_bh(&__ip_vs_svc_lock);
+
+		/*
+		 * Wait until all other svc users go away.
+		 */
+		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+
+		/* call the update_service function of its scheduler */
+		if (svc->scheduler->update_service)
+			svc->scheduler->update_service(svc);
+
+		write_unlock_bh(&__ip_vs_svc_lock);
+		return 0;
+	}
+
+	/*
+	 * Allocate and initialize the dest structure
+	 */
+	ret = ip_vs_new_dest(svc, udest, &dest);
+	if (ret) {
+		return ret;
+	}
+
+	/*
+	 * Add the dest entry into the list
+	 */
+	atomic_inc(&dest->refcnt);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 * Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	list_add(&dest->n_list, &svc->destinations);
+	svc->num_dests++;
+
+	/* call the update_service function of its scheduler */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Edit a destination in the given service
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *dest;
+	union nf_inet_addr daddr;
+	__be16 dport = udest->port;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
+			  "upper threshold\n");
+		return -ERANGE;
+	}
+
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+	/*
+	 *  Lookup the destination list
+	 */
+	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+
+	if (dest == NULL) {
+		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
+		return -ENOENT;
+	}
+
+	__ip_vs_update_dest(svc, dest, udest);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	/* call the update_service, because server weight may be changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Delete a destination (must be already unlinked from the service)
+ */
+static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+{
+	ip_vs_kill_estimator(&dest->stats);
+
+	/*
+	 *  Remove it from the d-linked list with the real services.
+	 */
+	write_lock_bh(&__ip_vs_rs_lock);
+	ip_vs_rs_unhash(dest);
+	write_unlock_bh(&__ip_vs_rs_lock);
+
+	/*
+	 *  Decrease the refcnt of the dest, and free the dest
+	 *  if nobody refers to it (refcnt=0). Otherwise, throw
+	 *  the destination into the trash.
+	 */
+	if (atomic_dec_and_test(&dest->refcnt)) {
+		ip_vs_dst_reset(dest);
+		/* simply decrease svc->refcnt here, let the caller check
+		   and release the service if nobody refers to it.
+		   Only user context can release destination and service,
+		   and only one user context can update virtual service at a
+		   time, so the operation here is OK */
+		atomic_dec(&dest->svc->refcnt);
+		kfree(dest);
+	} else {
+		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
+			      "dest->refcnt=%d\n",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		list_add(&dest->n_list, &ip_vs_dest_trash);
+		atomic_inc(&dest->refcnt);
+	}
+}
+
+
+/*
+ *	Unlink a destination from the given service
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+				struct ip_vs_dest *dest,
+				int svcupd)
+{
+	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+	/*
+	 *  Remove it from the d-linked destination list.
+	 */
+	list_del(&dest->n_list);
+	svc->num_dests--;
+
+	/*
+	 *  Call the update_service function of its scheduler
+	 */
+	if (svcupd && svc->scheduler->update_service)
+			svc->scheduler->update_service(svc);
+}
+
+
+/*
+ *	Delete a destination server in the given service
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *dest;
+	__be16 dport = udest->port;
+
+	EnterFunction(2);
+
+	dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+
+	if (dest == NULL) {
+		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
+		return -ENOENT;
+	}
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 *	Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	/*
+	 *	Unlink dest from the service
+	 */
+	__ip_vs_unlink_dest(svc, dest, 1);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 *	Delete the destination
+	 */
+	__ip_vs_del_dest(dest);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Add a service into the service hash table
+ */
+static int
+ip_vs_add_service(struct ip_vs_service_user_kern *u,
+		  struct ip_vs_service **svc_p)
+{
+	int ret = 0;
+	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_service *svc = NULL;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	/* Lookup the scheduler by 'u->sched_name' */
+	sched = ip_vs_scheduler_get(u->sched_name);
+	if (sched == NULL) {
+		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+			   u->sched_name);
+		ret = -ENOENT;
+		goto out_mod_dec;
+	}
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (u->af == AF_INET6) {
+		if (!sched->supports_ipv6) {
+			ret = -EAFNOSUPPORT;
+			goto out_err;
+		}
+		if ((u->netmask < 1) || (u->netmask > 128)) {
+			ret = -EINVAL;
+			goto out_err;
+		}
+	}
+#endif
+
+	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+	if (svc == NULL) {
+		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* I'm the first user of the service */
+	atomic_set(&svc->usecnt, 1);
+	atomic_set(&svc->refcnt, 0);
+
+	svc->af = u->af;
+	svc->protocol = u->protocol;
+	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
+	svc->port = u->port;
+	svc->fwmark = u->fwmark;
+	svc->flags = u->flags;
+	svc->timeout = u->timeout * HZ;
+	svc->netmask = u->netmask;
+
+	INIT_LIST_HEAD(&svc->destinations);
+	rwlock_init(&svc->sched_lock);
+	spin_lock_init(&svc->stats.lock);
+
+	/* Bind the scheduler */
+	ret = ip_vs_bind_scheduler(svc, sched);
+	if (ret)
+		goto out_err;
+	sched = NULL;
+
+	/* Update the virtual service counters */
+	if (svc->port == FTPPORT)
+		atomic_inc(&ip_vs_ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_inc(&ip_vs_nullsvc_counter);
+
+	ip_vs_new_estimator(&svc->stats);
+
+	/* Count only IPv4 services for old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ip_vs_num_services++;
+
+	/* Hash the service into the service table */
+	write_lock_bh(&__ip_vs_svc_lock);
+	ip_vs_svc_hash(svc);
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	*svc_p = svc;
+	return 0;
+
+  out_err:
+	if (svc != NULL) {
+		if (svc->scheduler)
+			ip_vs_unbind_scheduler(svc);
+		if (svc->inc) {
+			local_bh_disable();
+			ip_vs_app_inc_put(svc->inc);
+			local_bh_enable();
+		}
+		kfree(svc);
+	}
+	ip_vs_scheduler_put(sched);
+
+  out_mod_dec:
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+/*
+ *	Edit a service and bind it with a new scheduler
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
+{
+	struct ip_vs_scheduler *sched, *old_sched;
+	int ret = 0;
+
+	/*
+	 * Lookup the scheduler, by 'u->sched_name'
+	 */
+	sched = ip_vs_scheduler_get(u->sched_name);
+	if (sched == NULL) {
+		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+			   u->sched_name);
+		return -ENOENT;
+	}
+	old_sched = sched;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (u->af == AF_INET6) {
+		if (!sched->supports_ipv6) {
+			ret = -EAFNOSUPPORT;
+			goto out;
+		}
+		if ((u->netmask < 1) || (u->netmask > 128)) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+#endif
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 * Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	/*
+	 * Set the flags and timeout value
+	 */
+	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+	svc->timeout = u->timeout * HZ;
+	svc->netmask = u->netmask;
+
+	old_sched = svc->scheduler;
+	if (sched != old_sched) {
+		/*
+		 * Unbind the old scheduler
+		 */
+		if ((ret = ip_vs_unbind_scheduler(svc))) {
+			old_sched = sched;
+			goto out_unlock;
+		}
+
+		/*
+		 * Bind the new scheduler
+		 */
+		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
+			/*
+			 * If ip_vs_bind_scheduler fails, restore the old
+			 * scheduler.
+			 * The main reason of failure is out of memory.
+			 *
+			 * The question is if the old scheduler can be
+			 * restored all the time. TODO: if it cannot be
+			 * restored some time, we must delete the service,
+			 * otherwise the system may crash.
+			 */
+			ip_vs_bind_scheduler(svc, old_sched);
+			old_sched = sched;
+			goto out_unlock;
+		}
+	}
+
+  out_unlock:
+	write_unlock_bh(&__ip_vs_svc_lock);
+#ifdef CONFIG_IP_VS_IPV6
+  out:
+#endif
+
+	if (old_sched)
+		ip_vs_scheduler_put(old_sched);
+
+	return ret;
+}
+
+
+/*
+ *	Delete a service from the service list
+ *	- The service must be unlinked, unlocked and not referenced!
+ *	- We are called under _bh lock
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest, *nxt;
+	struct ip_vs_scheduler *old_sched;
+
+	/* Count only IPv4 services for old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ip_vs_num_services--;
+
+	ip_vs_kill_estimator(&svc->stats);
+
+	/* Unbind scheduler */
+	old_sched = svc->scheduler;
+	ip_vs_unbind_scheduler(svc);
+	if (old_sched)
+		ip_vs_scheduler_put(old_sched);
+
+	/* Unbind app inc */
+	if (svc->inc) {
+		ip_vs_app_inc_put(svc->inc);
+		svc->inc = NULL;
+	}
+
+	/*
+	 *    Unlink the whole destination list
+	 */
+	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+		__ip_vs_unlink_dest(svc, dest, 0);
+		__ip_vs_del_dest(dest);
+	}
+
+	/*
+	 *    Update the virtual service counters
+	 */
+	if (svc->port == FTPPORT)
+		atomic_dec(&ip_vs_ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_dec(&ip_vs_nullsvc_counter);
+
+	/*
+	 *    Free the service if nobody refers to it
+	 */
+	if (atomic_read(&svc->refcnt) == 0)
+		kfree(svc);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+/*
+ *	Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (svc == NULL)
+		return -EEXIST;
+
+	/*
+	 * Unhash it from the service table
+	 */
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	ip_vs_svc_unhash(svc);
+
+	/*
+	 * Wait until all the svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	__ip_vs_del_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	return 0;
+}
+
+
+/*
+ *	Flush all the virtual services
+ */
+static int ip_vs_flush(void)
+{
+	int idx;
+	struct ip_vs_service *svc, *nxt;
+
+	/*
+	 * Flush the service table hashed by <protocol,addr,port>
+	 */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
+			write_lock_bh(&__ip_vs_svc_lock);
+			ip_vs_svc_unhash(svc);
+			/*
+			 * Wait until all the svc users go away.
+			 */
+			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+			__ip_vs_del_service(svc);
+			write_unlock_bh(&__ip_vs_svc_lock);
+		}
+	}
+
+	/*
+	 * Flush the service table hashed by fwmark
+	 */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry_safe(svc, nxt,
+					 &ip_vs_svc_fwm_table[idx], f_list) {
+			write_lock_bh(&__ip_vs_svc_lock);
+			ip_vs_svc_unhash(svc);
+			/*
+			 * Wait until all the svc users go away.
+			 */
+			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+			__ip_vs_del_service(svc);
+			write_unlock_bh(&__ip_vs_svc_lock);
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ *	Zero counters in a service or all services
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+
+	write_lock_bh(&__ip_vs_svc_lock);
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		ip_vs_zero_stats(&dest->stats);
+	}
+	ip_vs_zero_stats(&svc->stats);
+	write_unlock_bh(&__ip_vs_svc_lock);
+	return 0;
+}
+
+static int ip_vs_zero_all(void)
+{
+	int idx;
+	struct ip_vs_service *svc;
+
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	ip_vs_zero_stats(&ip_vs_stats);
+	return 0;
+}
+
+
+static int
+proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int val = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	if (write && (*valp != val)) {
+		if ((*valp < 0) || (*valp > 3)) {
+			/* Restore the correct value */
+			*valp = val;
+		} else {
+			update_defense_level();
+		}
+	}
+	return rc;
+}
+
+
+static int
+proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int val[2];
+	int rc;
+
+	/* backup the value first */
+	memcpy(val, valp, sizeof(val));
+
+	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+		/* Restore the correct value */
+		memcpy(valp, val, sizeof(val));
+	}
+	return rc;
+}
+
+
+/*
+ *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ */
+
+static struct ctl_table vs_vars[] = {
+	{
+		.procname	= "amemthresh",
+		.data		= &sysctl_ip_vs_amemthresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_IP_VS_DEBUG
+	{
+		.procname	= "debug_level",
+		.data		= &sysctl_ip_vs_debug_level,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.procname	= "am_droprate",
+		.data		= &sysctl_ip_vs_am_droprate,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "drop_entry",
+		.data		= &sysctl_ip_vs_drop_entry,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_defense_mode,
+	},
+	{
+		.procname	= "drop_packet",
+		.data		= &sysctl_ip_vs_drop_packet,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_defense_mode,
+	},
+	{
+		.procname	= "secure_tcp",
+		.data		= &sysctl_ip_vs_secure_tcp,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_defense_mode,
+	},
+#if 0
+	{
+		.procname	= "timeout_established",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_synsent",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_synrecv",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_finwait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_timewait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_close",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_closewait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_lastack",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_listen",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_synack",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_udp",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_icmp",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+#endif
+	{
+		.procname	= "cache_bypass",
+		.data		= &sysctl_ip_vs_cache_bypass,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "expire_nodest_conn",
+		.data		= &sysctl_ip_vs_expire_nodest_conn,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "expire_quiescent_template",
+		.data		= &sysctl_ip_vs_expire_quiescent_template,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "sync_threshold",
+		.data		= &sysctl_ip_vs_sync_threshold,
+		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_sync_threshold,
+	},
+	{
+		.procname	= "nat_icmp_send",
+		.data		= &sysctl_ip_vs_nat_icmp_send,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+const struct ctl_path net_vs_ctl_path[] = {
+	{ .procname = "net", .ctl_name = CTL_NET, },
+	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
+	{ .procname = "vs", },
+	{ }
+};
+EXPORT_SYMBOL_GPL(net_vs_ctl_path);
+
+static struct ctl_table_header * sysctl_header;
+
+#ifdef CONFIG_PROC_FS
+
+struct ip_vs_iter {
+	struct list_head *table;
+	int bucket;
+};
+
+/*
+ *	Write the contents of the VS rule table to a PROCfs file.
+ *	(It is kept just for backward compatibility)
+ */
+static inline const char *ip_vs_fwd_name(unsigned flags)
+{
+	switch (flags & IP_VS_CONN_F_FWD_MASK) {
+	case IP_VS_CONN_F_LOCALNODE:
+		return "Local";
+	case IP_VS_CONN_F_TUNNEL:
+		return "Tunnel";
+	case IP_VS_CONN_F_DROUTE:
+		return "Route";
+	default:
+		return "Masq";
+	}
+}
+
+
+/* Get the Nth entry in the two lists */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+	struct ip_vs_iter *iter = seq->private;
+	int idx;
+	struct ip_vs_service *svc;
+
+	/* look in hash by protocol */
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			if (pos-- == 0){
+				iter->table = ip_vs_svc_table;
+				iter->bucket = idx;
+				return svc;
+			}
+		}
+	}
+
+	/* keep looking in fwmark */
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			if (pos-- == 0) {
+				iter->table = ip_vs_svc_fwm_table;
+				iter->bucket = idx;
+				return svc;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+__acquires(__ip_vs_svc_lock)
+{
+
+	read_lock_bh(&__ip_vs_svc_lock);
+	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct list_head *e;
+	struct ip_vs_iter *iter;
+	struct ip_vs_service *svc;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_info_array(seq,0);
+
+	svc = v;
+	iter = seq->private;
+
+	if (iter->table == ip_vs_svc_table) {
+		/* next service in table hashed by protocol */
+		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
+			return list_entry(e, struct ip_vs_service, s_list);
+
+
+		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
+					    s_list) {
+				return svc;
+			}
+		}
+
+		iter->table = ip_vs_svc_fwm_table;
+		iter->bucket = -1;
+		goto scan_fwmark;
+	}
+
+	/* next service in hashed by fwmark */
+	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
+		return list_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
+				    f_list)
+			return svc;
+	}
+
+	return NULL;
+}
+
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+__releases(__ip_vs_svc_lock)
+{
+	read_unlock_bh(&__ip_vs_svc_lock);
+}
+
+
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			"IP Virtual Server version %d.%d.%d (size=%d)\n",
+			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+		seq_puts(seq,
+			 "Prot LocalAddress:Port Scheduler Flags\n");
+		seq_puts(seq,
+			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+	} else {
+		const struct ip_vs_service *svc = v;
+		const struct ip_vs_iter *iter = seq->private;
+		const struct ip_vs_dest *dest;
+
+		if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+			if (svc->af == AF_INET6)
+				seq_printf(seq, "%s  [" NIP6_FMT "]:%04X %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   NIP6(svc->addr.in6),
+					   ntohs(svc->port),
+					   svc->scheduler->name);
+			else
+#endif
+				seq_printf(seq, "%s  %08X:%04X %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   ntohl(svc->addr.ip),
+					   ntohs(svc->port),
+					   svc->scheduler->name);
+		} else {
+			seq_printf(seq, "FWM  %08X %s ",
+				   svc->fwmark, svc->scheduler->name);
+		}
+
+		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+			seq_printf(seq, "persistent %d %08X\n",
+				svc->timeout,
+				ntohl(svc->netmask));
+		else
+			seq_putc(seq, '\n');
+
+		list_for_each_entry(dest, &svc->destinations, n_list) {
+#ifdef CONFIG_IP_VS_IPV6
+			if (dest->af == AF_INET6)
+				seq_printf(seq,
+					   "  -> [" NIP6_FMT "]:%04X"
+					   "      %-7s %-6d %-10d %-10d\n",
+					   NIP6(dest->addr.in6),
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+			else
+#endif
+				seq_printf(seq,
+					   "  -> %08X:%04X      "
+					   "%-7s %-6d %-10d %-10d\n",
+					   ntohl(dest->addr.ip),
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+
+		}
+	}
+	return 0;
+}
+
+static const struct seq_operations ip_vs_info_seq_ops = {
+	.start = ip_vs_info_seq_start,
+	.next  = ip_vs_info_seq_next,
+	.stop  = ip_vs_info_seq_stop,
+	.show  = ip_vs_info_seq_show,
+};
+
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &ip_vs_info_seq_ops,
+			sizeof(struct ip_vs_iter));
+}
+
+static const struct file_operations ip_vs_info_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_info_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif
+
+struct ip_vs_stats ip_vs_stats = {
+	.lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
+};
+
+#ifdef CONFIG_PROC_FS
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+
+/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
+	seq_printf(seq,
+		   "   Conns  Packets  Packets            Bytes            Bytes\n");
+
+	spin_lock_bh(&ip_vs_stats.lock);
+	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
+		   ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
+		   (unsigned long long) ip_vs_stats.ustats.inbytes,
+		   (unsigned long long) ip_vs_stats.ustats.outbytes);
+
+/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
+	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
+			ip_vs_stats.ustats.cps,
+			ip_vs_stats.ustats.inpps,
+			ip_vs_stats.ustats.outpps,
+			ip_vs_stats.ustats.inbps,
+			ip_vs_stats.ustats.outbps);
+	spin_unlock_bh(&ip_vs_stats.lock);
+
+	return 0;
+}
+
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ip_vs_stats_show, NULL);
+}
+
+static const struct file_operations ip_vs_stats_fops = {
+	.owner = THIS_MODULE,
+	.open = ip_vs_stats_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+#endif
+
+/*
+ *	Set timeout values for tcp tcpfin udp in the timeout_table.
+ */
+static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+{
+	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+		  u->tcp_timeout,
+		  u->tcp_fin_timeout,
+		  u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout) {
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+			= u->tcp_timeout * HZ;
+	}
+
+	if (u->tcp_fin_timeout) {
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+			= u->tcp_fin_timeout * HZ;
+	}
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout) {
+		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+			= u->udp_timeout * HZ;
+	}
+#endif
+	return 0;
+}
+
+
+#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
+#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
+				 sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
+#define MAX_ARG_LEN		SVCDEST_ARG_LEN
+
+static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
+	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
+};
+
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+				  struct ip_vs_service_user *usvc_compat)
+{
+	usvc->af		= AF_INET;
+	usvc->protocol		= usvc_compat->protocol;
+	usvc->addr.ip		= usvc_compat->addr;
+	usvc->port		= usvc_compat->port;
+	usvc->fwmark		= usvc_compat->fwmark;
+
+	/* Deep copy of sched_name is not needed here */
+	usvc->sched_name	= usvc_compat->sched_name;
+
+	usvc->flags		= usvc_compat->flags;
+	usvc->timeout		= usvc_compat->timeout;
+	usvc->netmask		= usvc_compat->netmask;
+}
+
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+				   struct ip_vs_dest_user *udest_compat)
+{
+	udest->addr.ip		= udest_compat->addr;
+	udest->port		= udest_compat->port;
+	udest->conn_flags	= udest_compat->conn_flags;
+	udest->weight		= udest_compat->weight;
+	udest->u_threshold	= udest_compat->u_threshold;
+	udest->l_threshold	= udest_compat->l_threshold;
+}
+
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	int ret;
+	unsigned char arg[MAX_ARG_LEN];
+	struct ip_vs_service_user *usvc_compat;
+	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
+	struct ip_vs_dest_user *udest_compat;
+	struct ip_vs_dest_user_kern udest;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (len != set_arglen[SET_CMDID(cmd)]) {
+		IP_VS_ERR("set_ctl: len %u != %u\n",
+			  len, set_arglen[SET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(arg, user, len) != 0)
+		return -EFAULT;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
+		ret = -ERESTARTSYS;
+		goto out_dec;
+	}
+
+	if (cmd == IP_VS_SO_SET_FLUSH) {
+		/* Flush the virtual service */
+		ret = ip_vs_flush();
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+		/* Set timeout values for (tcp tcpfin udp) */
+		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+		ret = stop_sync_thread(dm->state);
+		goto out_unlock;
+	}
+
+	usvc_compat = (struct ip_vs_service_user *)arg;
+	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+	/* We only use the new structs internally, so copy userspace compat
+	 * structs to extended internal versions */
+	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+	ip_vs_copy_udest_compat(&udest, udest_compat);
+
+	if (cmd == IP_VS_SO_SET_ZERO) {
+		/* if no service address is set, zero counters in all */
+		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
+			ret = ip_vs_zero_all();
+			goto out_unlock;
+		}
+	}
+
+	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
+	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
+		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
+			  usvc.protocol, NIPQUAD(usvc.addr.ip),
+			  ntohs(usvc.port), usvc.sched_name);
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
+					  &usvc.addr, usvc.port);
+	else
+		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+
+	if (cmd != IP_VS_SO_SET_ADD
+	    && (svc == NULL || svc->protocol != usvc.protocol)) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	switch (cmd) {
+	case IP_VS_SO_SET_ADD:
+		if (svc != NULL)
+			ret = -EEXIST;
+		else
+			ret = ip_vs_add_service(&usvc, &svc);
+		break;
+	case IP_VS_SO_SET_EDIT:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IP_VS_SO_SET_DEL:
+		ret = ip_vs_del_service(svc);
+		if (!ret)
+			goto out_unlock;
+		break;
+	case IP_VS_SO_SET_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	case IP_VS_SO_SET_ADDDEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IP_VS_SO_SET_EDITDEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IP_VS_SO_SET_DELDEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (svc)
+		ip_vs_service_put(svc);
+
+  out_unlock:
+	mutex_unlock(&__ip_vs_mutex);
+  out_dec:
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+	spin_lock_bh(&src->lock);
+	memcpy(dst, &src->ustats, sizeof(*dst));
+	spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+	dst->protocol = src->protocol;
+	dst->addr = src->addr.ip;
+	dst->port = src->port;
+	dst->fwmark = src->fwmark;
+	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+	dst->flags = src->flags;
+	dst->timeout = src->timeout / HZ;
+	dst->netmask = src->netmask;
+	dst->num_dests = src->num_dests;
+	ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+static inline int
+__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+			    struct ip_vs_get_services __user *uptr)
+{
+	int idx, count=0;
+	struct ip_vs_service *svc;
+	struct ip_vs_service_entry entry;
+	int ret = 0;
+
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (svc->af != AF_INET)
+				continue;
+
+			if (count >= get->num_services)
+				goto out;
+			memset(&entry, 0, sizeof(entry));
+			ip_vs_copy_service(&entry, svc);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				goto out;
+			}
+			count++;
+		}
+	}
+
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (svc->af != AF_INET)
+				continue;
+
+			if (count >= get->num_services)
+				goto out;
+			memset(&entry, 0, sizeof(entry));
+			ip_vs_copy_service(&entry, svc);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				goto out;
+			}
+			count++;
+		}
+	}
+  out:
+	return ret;
+}
+
+static inline int
+__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+			 struct ip_vs_get_dests __user *uptr)
+{
+	struct ip_vs_service *svc;
+	union nf_inet_addr addr = { .ip = get->addr };
+	int ret = 0;
+
+	if (get->fwmark)
+		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+	else
+		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
+					  get->port);
+
+	if (svc) {
+		int count = 0;
+		struct ip_vs_dest *dest;
+		struct ip_vs_dest_entry entry;
+
+		list_for_each_entry(dest, &svc->destinations, n_list) {
+			if (count >= get->num_dests)
+				break;
+
+			entry.addr = dest->addr.ip;
+			entry.port = dest->port;
+			entry.conn_flags = atomic_read(&dest->conn_flags);
+			entry.weight = atomic_read(&dest->weight);
+			entry.u_threshold = dest->u_threshold;
+			entry.l_threshold = dest->l_threshold;
+			entry.activeconns = atomic_read(&dest->activeconns);
+			entry.inactconns = atomic_read(&dest->inactconns);
+			entry.persistconns = atomic_read(&dest->persistconns);
+			ip_vs_copy_stats(&entry.stats, &dest->stats);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				break;
+			}
+			count++;
+		}
+		ip_vs_service_put(svc);
+	} else
+		ret = -ESRCH;
+	return ret;
+}
+
+static inline void
+__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+{
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	u->tcp_timeout =
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+	u->tcp_fin_timeout =
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	u->udp_timeout =
+		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+#define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
+#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
+
+static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
+	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
+};
+
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	unsigned char arg[128];
+	int ret = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (*len < get_arglen[GET_CMDID(cmd)]) {
+		IP_VS_ERR("get_ctl: len %u < %u\n",
+			  *len, get_arglen[GET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+		return -EFAULT;
+
+	if (mutex_lock_interruptible(&__ip_vs_mutex))
+		return -ERESTARTSYS;
+
+	switch (cmd) {
+	case IP_VS_SO_GET_VERSION:
+	{
+		char buf[64];
+
+		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		*len = strlen(buf)+1;
+	}
+	break;
+
+	case IP_VS_SO_GET_INFO:
+	{
+		struct ip_vs_getinfo info;
+		info.version = IP_VS_VERSION_CODE;
+		info.size = IP_VS_CONN_TAB_SIZE;
+		info.num_services = ip_vs_num_services;
+		if (copy_to_user(user, &info, sizeof(info)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	case IP_VS_SO_GET_SERVICES:
+	{
+		struct ip_vs_get_services *get;
+		int size;
+
+		get = (struct ip_vs_get_services *)arg;
+		size = sizeof(*get) +
+			sizeof(struct ip_vs_service_entry) * get->num_services;
+		if (*len != size) {
+			IP_VS_ERR("length: %u != %u\n", *len, size);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __ip_vs_get_service_entries(get, user);
+	}
+	break;
+
+	case IP_VS_SO_GET_SERVICE:
+	{
+		struct ip_vs_service_entry *entry;
+		struct ip_vs_service *svc;
+		union nf_inet_addr addr;
+
+		entry = (struct ip_vs_service_entry *)arg;
+		addr.ip = entry->addr;
+		if (entry->fwmark)
+			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+		else
+			svc = __ip_vs_service_get(AF_INET, entry->protocol,
+						  &addr, entry->port);
+		if (svc) {
+			ip_vs_copy_service(entry, svc);
+			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+				ret = -EFAULT;
+			ip_vs_service_put(svc);
+		} else
+			ret = -ESRCH;
+	}
+	break;
+
+	case IP_VS_SO_GET_DESTS:
+	{
+		struct ip_vs_get_dests *get;
+		int size;
+
+		get = (struct ip_vs_get_dests *)arg;
+		size = sizeof(*get) +
+			sizeof(struct ip_vs_dest_entry) * get->num_dests;
+		if (*len != size) {
+			IP_VS_ERR("length: %u != %u\n", *len, size);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __ip_vs_get_dest_entries(get, user);
+	}
+	break;
+
+	case IP_VS_SO_GET_TIMEOUT:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(&t);
+		if (copy_to_user(user, &t, sizeof(t)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	case IP_VS_SO_GET_DAEMON:
+	{
+		struct ip_vs_daemon_user d[2];
+
+		memset(&d, 0, sizeof(d));
+		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+			d[0].state = IP_VS_STATE_MASTER;
+			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
+			d[0].syncid = ip_vs_master_syncid;
+		}
+		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+			d[1].state = IP_VS_STATE_BACKUP;
+			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
+			d[1].syncid = ip_vs_backup_syncid;
+		}
+		if (copy_to_user(user, &d, sizeof(d)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	default:
+		ret = -EINVAL;
+	}
+
+  out:
+	mutex_unlock(&__ip_vs_mutex);
+	return ret;
+}
+
+
+static struct nf_sockopt_ops ip_vs_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= IP_VS_BASE_CTL,
+	.set_optmax	= IP_VS_SO_SET_MAX+1,
+	.set		= do_ip_vs_set_ctl,
+	.get_optmin	= IP_VS_BASE_CTL,
+	.get_optmax	= IP_VS_SO_GET_MAX+1,
+	.get		= do_ip_vs_get_ctl,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Generic Netlink interface
+ */
+
+/* IPVS genetlink family */
+static struct genl_family ip_vs_genl_family = {
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= IPVS_GENL_NAME,
+	.version	= IPVS_GENL_VERSION,
+	.maxattr	= IPVS_CMD_MAX,
+};
+
+/* Policy used for first-level command attributes */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_IFNAME_MAXLEN },
+	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
+					    .len = sizeof(struct ip_vs_flags) },
+	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+				 struct ip_vs_stats *stats)
+{
+	struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+	if (!nl_stats)
+		return -EMSGSIZE;
+
+	spin_lock_bh(&stats->lock);
+
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
+
+	spin_unlock_bh(&stats->lock);
+
+	nla_nest_end(skb, nl_stats);
+
+	return 0;
+
+nla_put_failure:
+	spin_unlock_bh(&stats->lock);
+	nla_nest_cancel(skb, nl_stats);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct nlattr *nl_service;
+	struct ip_vs_flags flags = { .flags = svc->flags,
+				     .mask = ~0 };
+
+	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		return -EMSGSIZE;
+
+	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
+
+	if (svc->fwmark) {
+		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+	} else {
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
+		NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+	}
+
+	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nl_service);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_service);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc,
+				   struct netlink_callback *cb)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_SERVICE);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_service(skb, svc) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	int idx = 0, i;
+	int start = cb->args[0];
+	struct ip_vs_service *svc;
+
+	mutex_lock(&__ip_vs_mutex);
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+			if (++idx <= start)
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				idx--;
+				goto nla_put_failure;
+			}
+		}
+	}
+
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+			if (++idx <= start)
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				idx--;
+				goto nla_put_failure;
+			}
+		}
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+				    struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+
+	/* Parse mandatory identifying service fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+		return -EINVAL;
+
+	nla_af		= attrs[IPVS_SVC_ATTR_AF];
+	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
+	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
+	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
+	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];
+
+	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
+		return -EINVAL;
+
+	usvc->af = nla_get_u16(nla_af);
+#ifdef CONFIG_IP_VS_IPV6
+	if (usvc->af != AF_INET && usvc->af != AF_INET6)
+#else
+	if (usvc->af != AF_INET)
+#endif
+		return -EAFNOSUPPORT;
+
+	if (nla_fwmark) {
+		usvc->protocol = IPPROTO_TCP;
+		usvc->fwmark = nla_get_u32(nla_fwmark);
+	} else {
+		usvc->protocol = nla_get_u16(nla_protocol);
+		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+		usvc->port = nla_get_u16(nla_port);
+		usvc->fwmark = 0;
+	}
+
+	/* If a full entry was requested, check for the additional fields */
+	if (full_entry) {
+		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+			      *nla_netmask;
+		struct ip_vs_flags flags;
+		struct ip_vs_service *svc;
+
+		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
+		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
+		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
+
+		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
+			return -EINVAL;
+
+		nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+		/* prefill flags from service if it already exists */
+		if (usvc->fwmark)
+			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
+		else
+			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
+						  &usvc->addr, usvc->port);
+		if (svc) {
+			usvc->flags = svc->flags;
+			ip_vs_service_put(svc);
+		} else
+			usvc->flags = 0;
+
+		/* set new flags from userland */
+		usvc->flags = (usvc->flags & ~flags.mask) |
+			      (flags.flags & flags.mask);
+		usvc->sched_name = nla_data(nla_sched);
+		usvc->timeout = nla_get_u32(nla_timeout);
+		usvc->netmask = nla_get_u32(nla_netmask);
+	}
+
+	return 0;
+}
+
+static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+{
+	struct ip_vs_service_user_kern usvc;
+	int ret;
+
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (usvc.fwmark)
+		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+	else
+		return __ip_vs_service_get(usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
+}
+
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+	struct nlattr *nl_dest;
+
+	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+	if (!nl_dest)
+		return -EMSGSIZE;
+
+	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
+	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
+
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+		    atomic_read(&dest->activeconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+		    atomic_read(&dest->inactconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+		    atomic_read(&dest->persistconns));
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nl_dest);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_dest);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+				struct netlink_callback *cb)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DEST);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_dest(skb, dest) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	int idx = 0;
+	int start = cb->args[0];
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+
+	mutex_lock(&__ip_vs_mutex);
+
+	/* Try to find the service for which to dump destinations */
+	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
+		goto out_err;
+
+	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+	if (IS_ERR(svc) || svc == NULL)
+		goto out_err;
+
+	/* Dump the destinations */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++idx <= start)
+			continue;
+		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+			idx--;
+			goto nla_put_failure;
+		}
+	}
+
+nla_put_failure:
+	cb->args[0] = idx;
+	ip_vs_service_put(svc);
+
+out_err:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
+				 struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+	struct nlattr *nla_addr, *nla_port;
+
+	/* Parse mandatory identifying destination fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+		return -EINVAL;
+
+	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
+	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
+
+	if (!(nla_addr && nla_port))
+		return -EINVAL;
+
+	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+	udest->port = nla_get_u16(nla_port);
+
+	/* If a full entry was requested, check for the additional fields */
+	if (full_entry) {
+		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
+			      *nla_l_thresh;
+
+		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
+		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
+		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
+		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
+
+		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
+			return -EINVAL;
+
+		udest->conn_flags = nla_get_u32(nla_fwd)
+				    & IP_VS_CONN_F_FWD_MASK;
+		udest->weight = nla_get_u32(nla_weight);
+		udest->u_threshold = nla_get_u32(nla_u_thresh);
+		udest->l_threshold = nla_get_u32(nla_l_thresh);
+	}
+
+	return 0;
+}
+
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
+				  const char *mcast_ifn, __be32 syncid)
+{
+	struct nlattr *nl_daemon;
+
+	nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+	if (!nl_daemon)
+		return -EMSGSIZE;
+
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
+	NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
+
+	nla_nest_end(skb, nl_daemon);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_daemon);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
+				  const char *mcast_ifn, __be32 syncid,
+				  struct netlink_callback *cb)
+{
+	void *hdr;
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DAEMON);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	mutex_lock(&__ip_vs_mutex);
+	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+					   ip_vs_master_mcast_ifn,
+					   ip_vs_master_syncid, cb) < 0)
+			goto nla_put_failure;
+
+		cb->args[0] = 1;
+	}
+
+	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+					   ip_vs_backup_mcast_ifn,
+					   ip_vs_backup_syncid, cb) < 0)
+			goto nla_put_failure;
+
+		cb->args[1] = 1;
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+{
+	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
+	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
+	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
+		return -EINVAL;
+
+	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+}
+
+static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+{
+	if (!attrs[IPVS_DAEMON_ATTR_STATE])
+		return -EINVAL;
+
+	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+}
+
+static int ip_vs_genl_set_config(struct nlattr **attrs)
+{
+	struct ip_vs_timeout_user t;
+
+	__ip_vs_get_timeouts(&t);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
+		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
+		t.tcp_fin_timeout =
+			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
+		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
+
+	return ip_vs_set_timeout(&t);
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ip_vs_service *svc = NULL;
+	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_dest_user_kern udest;
+	int ret = 0, cmd;
+	int need_full_svc = 0, need_full_dest = 0;
+
+	cmd = info->genlhdr->cmd;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (cmd == IPVS_CMD_FLUSH) {
+		ret = ip_vs_flush();
+		goto out;
+	} else if (cmd == IPVS_CMD_SET_CONFIG) {
+		ret = ip_vs_genl_set_config(info->attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
+		   cmd == IPVS_CMD_DEL_DAEMON) {
+
+		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+				     info->attrs[IPVS_CMD_ATTR_DAEMON],
+				     ip_vs_daemon_policy)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (cmd == IPVS_CMD_NEW_DAEMON)
+			ret = ip_vs_genl_new_daemon(daemon_attrs);
+		else
+			ret = ip_vs_genl_del_daemon(daemon_attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_ZERO &&
+		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+		ret = ip_vs_zero_all();
+		goto out;
+	}
+
+	/* All following commands require a service argument, so check if we
+	 * received a valid one. We need a full service specification when
+	 * adding / editing a service. Only identifying members otherwise. */
+	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+		need_full_svc = 1;
+
+	ret = ip_vs_genl_parse_service(&usvc,
+				       info->attrs[IPVS_CMD_ATTR_SERVICE],
+				       need_full_svc);
+	if (ret)
+		goto out;
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
+					  &usvc.addr, usvc.port);
+	else
+		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+
+	/* Unless we're adding a new service, the service must already exist */
+	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* Destination commands require a valid destination argument. For
+	 * adding / editing a destination, we need a full destination
+	 * specification. */
+	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+	    cmd == IPVS_CMD_DEL_DEST) {
+		if (cmd != IPVS_CMD_DEL_DEST)
+			need_full_dest = 1;
+
+		ret = ip_vs_genl_parse_dest(&udest,
+					    info->attrs[IPVS_CMD_ATTR_DEST],
+					    need_full_dest);
+		if (ret)
+			goto out;
+	}
+
+	switch (cmd) {
+	case IPVS_CMD_NEW_SERVICE:
+		if (svc == NULL)
+			ret = ip_vs_add_service(&usvc, &svc);
+		else
+			ret = -EEXIST;
+		break;
+	case IPVS_CMD_SET_SERVICE:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IPVS_CMD_DEL_SERVICE:
+		ret = ip_vs_del_service(svc);
+		break;
+	case IPVS_CMD_NEW_DEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IPVS_CMD_SET_DEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IPVS_CMD_DEL_DEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	case IPVS_CMD_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+out:
+	if (svc)
+		ip_vs_service_put(svc);
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	void *reply;
+	int ret, cmd, reply_cmd;
+
+	cmd = info->genlhdr->cmd;
+
+	if (cmd == IPVS_CMD_GET_SERVICE)
+		reply_cmd = IPVS_CMD_NEW_SERVICE;
+	else if (cmd == IPVS_CMD_GET_INFO)
+		reply_cmd = IPVS_CMD_SET_INFO;
+	else if (cmd == IPVS_CMD_GET_CONFIG)
+		reply_cmd = IPVS_CMD_SET_CONFIG;
+	else {
+		IP_VS_ERR("unknown Generic Netlink command\n");
+		return -EINVAL;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+	if (reply == NULL)
+		goto nla_put_failure;
+
+	switch (cmd) {
+	case IPVS_CMD_GET_SERVICE:
+	{
+		struct ip_vs_service *svc;
+
+		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+		if (IS_ERR(svc)) {
+			ret = PTR_ERR(svc);
+			goto out_err;
+		} else if (svc) {
+			ret = ip_vs_genl_fill_service(msg, svc);
+			ip_vs_service_put(svc);
+			if (ret)
+				goto nla_put_failure;
+		} else {
+			ret = -ESRCH;
+			goto out_err;
+		}
+
+		break;
+	}
+
+	case IPVS_CMD_GET_CONFIG:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(&t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+			    t.tcp_fin_timeout);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+#endif
+
+		break;
+	}
+
+	case IPVS_CMD_GET_INFO:
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+			    IP_VS_CONN_TAB_SIZE);
+		break;
+	}
+
+	genlmsg_end(msg, reply);
+	ret = genlmsg_unicast(msg, info->snd_pid);
+	goto out;
+
+nla_put_failure:
+	IP_VS_ERR("not enough space in Netlink message\n");
+	ret = -EMSGSIZE;
+
+out_err:
+	nlmsg_free(msg);
+out:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+
+static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+	{
+		.cmd	= IPVS_CMD_NEW_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+		.dumpit	= ip_vs_genl_dump_services,
+		.policy	= ip_vs_cmd_policy,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.dumpit	= ip_vs_genl_dump_dests,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_daemons,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_INFO,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_ZERO,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_FLUSH,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+};
+
+static int __init ip_vs_genl_register(void)
+{
+	int ret, i;
+
+	ret = genl_register_family(&ip_vs_genl_family);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
+		ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
+		if (ret)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	genl_unregister_family(&ip_vs_genl_family);
+	return ret;
+}
+
+static void ip_vs_genl_unregister(void)
+{
+	genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+
+int __init ip_vs_control_init(void)
+{
+	int ret;
+	int idx;
+
+	EnterFunction(2);
+
+	ret = nf_register_sockopt(&ip_vs_sockopts);
+	if (ret) {
+		IP_VS_ERR("cannot register sockopt.\n");
+		return ret;
+	}
+
+	ret = ip_vs_genl_register();
+	if (ret) {
+		IP_VS_ERR("cannot register Generic Netlink interface.\n");
+		nf_unregister_sockopt(&ip_vs_sockopts);
+		return ret;
+	}
+
+	proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
+	proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
+
+	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
+
+	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+	}
+
+	ip_vs_new_estimator(&ip_vs_stats);
+
+	/* Hook the defense timer */
+	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+	EnterFunction(2);
+	ip_vs_trash_cleanup();
+	cancel_rearming_delayed_work(&defense_work);
+	cancel_work_sync(&defense_work.work);
+	ip_vs_kill_estimator(&ip_vs_stats);
+	unregister_sysctl_table(sysctl_header);
+	proc_net_remove(&init_net, "ip_vs_stats");
+	proc_net_remove(&init_net, "ip_vs");
+	ip_vs_genl_unregister();
+	nf_unregister_sockopt(&ip_vs_sockopts);
+	LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
new file mode 100644
index 00000000000..a16943fd72f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,261 @@
+/*
+ * IPVS:        Destination Hashing scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              Inspired by the consistent hashing scheduler patch from
+ *              Thomas Proell <proellt@gmx.de>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[dest_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) OR (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS        8
+#endif
+#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ *	Returns hash value for IPVS DH entry
+ */
+static inline unsigned ip_vs_dh_hashkey(__be32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr)
+{
+	return (tbl[ip_vs_dh_hashkey(addr)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_dh_bucket *b;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+
+	b = tbl;
+	p = &svc->destinations;
+	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+		if (list_empty(p)) {
+			b->dest = NULL;
+		} else {
+			if (p == &svc->destinations)
+				p = p->next;
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			atomic_inc(&dest->refcnt);
+			b->dest = dest;
+
+			p = p->next;
+		}
+		b++;
+	}
+	return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+	int i;
+	struct ip_vs_dh_bucket *b;
+
+	b = tbl;
+	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+		if (b->dest) {
+			atomic_dec(&b->dest->refcnt);
+			b->dest = NULL;
+		}
+		b++;
+	}
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *tbl;
+
+	/* allocate the DH table for this service */
+	tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
+		      GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_dh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_dh_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	return 0;
+}
+
+
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_dh_flush(tbl);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_dh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_dh_bucket *tbl;
+	struct iphdr *iph = ip_hdr(skb);
+
+	IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
+	dest = ip_vs_dh_get(tbl, iph->daddr);
+	if (!dest
+	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+	    || atomic_read(&dest->weight) <= 0
+	    || is_overloaded(dest)) {
+		return NULL;
+	}
+
+	IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->daddr),
+		  NIPQUAD(dest->addr.ip),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+	.name =			"dh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
+	.init_service =		ip_vs_dh_init_svc,
+	.done_service =		ip_vs_dh_done_svc,
+	.update_service =	ip_vs_dh_update_svc,
+	.schedule =		ip_vs_dh_schedule,
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
new file mode 100644
index 00000000000..2eb2860dabb
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,166 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+  This code is to estimate rate in a shorter interval (such as 8
+  seconds) for virtual services and real servers. For measure rate in a
+  long interval, it is easy to implement a user level daemon which
+  periodically reads those statistical counters and measure rate.
+
+  Currently, the measurement is activated by slow timer handler. Hope
+  this measurement will not introduce too much load.
+
+  We measure rate during the last 8 seconds every 2 seconds:
+
+    avgrate = avgrate*(1-W) + rate*W
+
+    where W = 2^(-2)
+
+  NOTES.
+
+  * The stored value for average bps is scaled by 2^5, so that maximal
+    rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
+
+  * A lot code is taken from net/sched/estimator.c
+ */
+
+
+static void estimation_timer(unsigned long arg);
+
+static LIST_HEAD(est_list);
+static DEFINE_SPINLOCK(est_lock);
+static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
+
+static void estimation_timer(unsigned long arg)
+{
+	struct ip_vs_estimator *e;
+	struct ip_vs_stats *s;
+	u32 n_conns;
+	u32 n_inpkts, n_outpkts;
+	u64 n_inbytes, n_outbytes;
+	u32 rate;
+
+	spin_lock(&est_lock);
+	list_for_each_entry(e, &est_list, list) {
+		s = container_of(e, struct ip_vs_stats, est);
+
+		spin_lock(&s->lock);
+		n_conns = s->ustats.conns;
+		n_inpkts = s->ustats.inpkts;
+		n_outpkts = s->ustats.outpkts;
+		n_inbytes = s->ustats.inbytes;
+		n_outbytes = s->ustats.outbytes;
+
+		/* scaled by 2^10, but divided 2 seconds */
+		rate = (n_conns - e->last_conns)<<9;
+		e->last_conns = n_conns;
+		e->cps += ((long)rate - (long)e->cps)>>2;
+		s->ustats.cps = (e->cps+0x1FF)>>10;
+
+		rate = (n_inpkts - e->last_inpkts)<<9;
+		e->last_inpkts = n_inpkts;
+		e->inpps += ((long)rate - (long)e->inpps)>>2;
+		s->ustats.inpps = (e->inpps+0x1FF)>>10;
+
+		rate = (n_outpkts - e->last_outpkts)<<9;
+		e->last_outpkts = n_outpkts;
+		e->outpps += ((long)rate - (long)e->outpps)>>2;
+		s->ustats.outpps = (e->outpps+0x1FF)>>10;
+
+		rate = (n_inbytes - e->last_inbytes)<<4;
+		e->last_inbytes = n_inbytes;
+		e->inbps += ((long)rate - (long)e->inbps)>>2;
+		s->ustats.inbps = (e->inbps+0xF)>>5;
+
+		rate = (n_outbytes - e->last_outbytes)<<4;
+		e->last_outbytes = n_outbytes;
+		e->outbps += ((long)rate - (long)e->outbps)>>2;
+		s->ustats.outbps = (e->outbps+0xF)>>5;
+		spin_unlock(&s->lock);
+	}
+	spin_unlock(&est_lock);
+	mod_timer(&est_timer, jiffies + 2*HZ);
+}
+
+void ip_vs_new_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *est = &stats->est;
+
+	INIT_LIST_HEAD(&est->list);
+
+	est->last_conns = stats->ustats.conns;
+	est->cps = stats->ustats.cps<<10;
+
+	est->last_inpkts = stats->ustats.inpkts;
+	est->inpps = stats->ustats.inpps<<10;
+
+	est->last_outpkts = stats->ustats.outpkts;
+	est->outpps = stats->ustats.outpps<<10;
+
+	est->last_inbytes = stats->ustats.inbytes;
+	est->inbps = stats->ustats.inbps<<5;
+
+	est->last_outbytes = stats->ustats.outbytes;
+	est->outbps = stats->ustats.outbps<<5;
+
+	spin_lock_bh(&est_lock);
+	list_add(&est->list, &est_list);
+	spin_unlock_bh(&est_lock);
+}
+
+void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *est = &stats->est;
+
+	spin_lock_bh(&est_lock);
+	list_del(&est->list);
+	spin_unlock_bh(&est_lock);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *est = &stats->est;
+
+	/* set counters zero, caller must hold the stats->lock lock */
+	est->last_inbytes = 0;
+	est->last_outbytes = 0;
+	est->last_conns = 0;
+	est->last_inpkts = 0;
+	est->last_outpkts = 0;
+	est->cps = 0;
+	est->inpps = 0;
+	est->outpps = 0;
+	est->inbps = 0;
+	est->outbps = 0;
+}
+
+int __init ip_vs_estimator_init(void)
+{
+	mod_timer(&est_timer, jiffies + 2 * HZ);
+	return 0;
+}
+
+void ip_vs_estimator_cleanup(void)
+{
+	del_timer_sync(&est_timer);
+}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
new file mode 100644
index 00000000000..2e7dbd8b73a
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -0,0 +1,410 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ *		IP_MASQ_FTP ftp masquerading module
+ *
+ * Version:	@(#)ip_masq_ftp.c 0.04   02/05/96
+ *
+ * Author:	Wouter Gadeyne
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, NULL, 0);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/*	Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern" and terminated with the "term" character.
+ * <addr,port> is in network order.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+				  const char *pattern, size_t plen, char term,
+				  __be32 *addr, __be16 *port,
+				  char **start, char **end)
+{
+	unsigned char p[6];
+	int i = 0;
+
+	if (data_limit - data < plen) {
+		/* check if there is partial match */
+		if (strnicmp(data, pattern, data_limit - data) == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	if (strnicmp(data, pattern, plen) != 0) {
+		return 0;
+	}
+	*start = data + plen;
+
+	for (data = *start; *data != term; data++) {
+		if (data == data_limit)
+			return -1;
+	}
+	*end = data;
+
+	memset(p, 0, sizeof(p));
+	for (data = *start; data != *end; data++) {
+		if (*data >= '0' && *data <= '9') {
+			p[i] = p[i]*10 + *data - '0';
+		} else if (*data == ',' && i < 5) {
+			i++;
+		} else {
+			/* unexpected character */
+			return -1;
+		}
+	}
+
+	if (i != 5)
+		return -1;
+
+	*addr = get_unaligned((__be32 *)p);
+	*port = get_unaligned((__be16 *)(p + 4));
+	return 1;
+}
+
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port.  Mark the current connection entry as a control channel
+ * of the new entry. All this work is just to make the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			 struct sk_buff *skb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_limit;
+	char *start, *end;
+	union nf_inet_addr from;
+	__be16 port;
+	struct ip_vs_conn *n_cp;
+	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+	unsigned buf_len;
+	int ret;
+
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	if (cp->app_data == &ip_vs_ftp_pasv) {
+		iph = ip_hdr(skb);
+		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+		data = (char *)th + (th->doff << 2);
+		data_limit = skb_tail_pointer(skb);
+
+		if (ip_vs_ftp_get_addrport(data, data_limit,
+					   SERVER_STRING,
+					   sizeof(SERVER_STRING)-1, ')',
+					   &from.ip, &port,
+					   &start, &end) != 1)
+			return 1;
+
+		IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
+			  "%u.%u.%u.%u:%d detected\n",
+			  NIPQUAD(from.ip), ntohs(port),
+			  NIPQUAD(cp->caddr.ip), 0);
+
+		/*
+		 * Now update or create an connection entry for it
+		 */
+		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
+					  &cp->caddr, 0);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+					      &cp->caddr, 0,
+					      &cp->vaddr, port,
+					      &from, port,
+					      IP_VS_CONN_F_NO_CPORT,
+					      cp->dest);
+			if (!n_cp)
+				return 0;
+
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
+
+		/*
+		 * Replace the old passive address with the new one
+		 */
+		from.ip = n_cp->vaddr.ip;
+		port = n_cp->vport;
+		sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
+			(ntohs(port)>>8)&255, ntohs(port)&255);
+		buf_len = strlen(buf);
+
+		/*
+		 * Calculate required delta-offset to keep TCP happy
+		 */
+		*diff = buf_len - (end-start);
+
+		if (*diff == 0) {
+			/* simply replace it with new passive address */
+			memcpy(start, buf, buf_len);
+			ret = 1;
+		} else {
+			ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
+					  end-start, buf, buf_len);
+		}
+
+		cp->app_data = NULL;
+		ip_vs_tcp_conn_listen(n_cp);
+		ip_vs_conn_put(n_cp);
+		return ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ *      "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			struct sk_buff *skb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_start, *data_limit;
+	char *start, *end;
+	union nf_inet_addr to;
+	__be16 port;
+	struct ip_vs_conn *n_cp;
+
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
+	/* no diff required for incoming packets */
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	/*
+	 * Detecting whether it is passive
+	 */
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Since there may be OPTIONS in the TCP packet and the HLEN is
+	   the length of the header in 32-bit multiples, it is accurate
+	   to calculate data address by th+HLEN*4 */
+	data = data_start = (char *)th + (th->doff << 2);
+	data_limit = skb_tail_pointer(skb);
+
+	while (data <= data_limit - 6) {
+		if (strnicmp(data, "PASV\r\n", 6) == 0) {
+			/* Passive mode on */
+			IP_VS_DBG(7, "got PASV at %td of %td\n",
+				  data - data_start,
+				  data_limit - data_start);
+			cp->app_data = &ip_vs_ftp_pasv;
+			return 1;
+		}
+		data++;
+	}
+
+	/*
+	 * To support virtual FTP server, the scenerio is as follows:
+	 *       FTP client ----> Load Balancer ----> FTP server
+	 * First detect the port number in the application data,
+	 * then create a new connection entry for the coming data
+	 * connection.
+	 */
+	if (ip_vs_ftp_get_addrport(data_start, data_limit,
+				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+				   '\r', &to.ip, &port,
+				   &start, &end) != 1)
+		return 1;
+
+	IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
+		  NIPQUAD(to.ip), ntohs(port));
+
+	/* Passive mode off */
+	cp->app_data = NULL;
+
+	/*
+	 * Now update or create a connection entry for it
+	 */
+	IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
+		  ip_vs_proto_name(iph->protocol),
+		  NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
+
+	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
+				 &to, port,
+				 &cp->vaddr, htons(ntohs(cp->vport)-1));
+	if (!n_cp) {
+		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+				      &to, port,
+				      &cp->vaddr, htons(ntohs(cp->vport)-1),
+				      &cp->daddr, htons(ntohs(cp->dport)-1),
+				      0,
+				      cp->dest);
+		if (!n_cp)
+			return 0;
+
+		/* add its controller */
+		ip_vs_control_add(n_cp, cp);
+	}
+
+	/*
+	 *	Move tunnel to listen state
+	 */
+	ip_vs_tcp_conn_listen(n_cp);
+	ip_vs_conn_put(n_cp);
+
+	return 1;
+}
+
+
+static struct ip_vs_app ip_vs_ftp = {
+	.name =		"ftp",
+	.type =		IP_VS_APP_TYPE_FTP,
+	.protocol =	IPPROTO_TCP,
+	.module =	THIS_MODULE,
+	.incs_list =	LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+	.init_conn =	ip_vs_ftp_init_conn,
+	.done_conn =	ip_vs_ftp_done_conn,
+	.bind_conn =	NULL,
+	.unbind_conn =	NULL,
+	.pkt_out =	ip_vs_ftp_out,
+	.pkt_in =	ip_vs_ftp_in,
+};
+
+
+/*
+ *	ip_vs_ftp initialization
+ */
+static int __init ip_vs_ftp_init(void)
+{
+	int i, ret;
+	struct ip_vs_app *app = &ip_vs_ftp;
+
+	ret = register_ip_vs_app(app);
+	if (ret)
+		return ret;
+
+	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+		if (!ports[i])
+			continue;
+		ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+		if (ret)
+			break;
+		IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
+			   app->name, i, ports[i]);
+	}
+
+	if (ret)
+		unregister_ip_vs_app(app);
+
+	return ret;
+}
+
+
+/*
+ *	ip_vs_ftp finish.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+	unregister_ip_vs_app(&ip_vs_ftp);
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
new file mode 100644
index 00000000000..6ecef3518ca
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -0,0 +1,555 @@
+/*
+ * IPVS:        Locality-Based Least-Connection scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Martin Hamilton         :    fixed the terrible locking bugs
+ *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
+ *     Wensong Zhang           :    fixed the uninitilized tbl->lock bug
+ *     Wensong Zhang           :    added doing full expiration check to
+ *                                   collect stale entries of 24+ hours when
+ *                                   no partial expire check in a half hour
+ *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
+ *                                   to avoid the possible race between timer
+ *                                   handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ *       if cachenode[dest_ip] is null then
+ *               n, cachenode[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- cachenode[dest_ip];
+ *               if (n is dead) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                 n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ *       return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblc entries,
+ *    when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
+
+
+/*
+ *     for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS      10
+#endif
+#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS lblc entry represents an association between destination
+ *      IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+	struct list_head        list;
+	__be32                  addr;           /* destination IP address */
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+	unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
+	atomic_t                entries;        /* number of entries */
+	int                     max_size;       /* maximum size of entries */
+	struct timer_list       periodic_timer; /* collect stale entries */
+	int                     rover;          /* rover for expire check */
+	int                     counter;        /* counter for no expire */
+};
+
+
+/*
+ *      IPVS LBLC sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+	{
+		.procname	= "lblc_expiration",
+		.data		= &sysctl_ip_vs_lblc_expiration,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+{
+	list_del(&en->list);
+	/*
+	 * We don't kfree dest because it is refered either by its service
+	 * or the trash dest list.
+	 */
+	atomic_dec(&en->dest->refcnt);
+	kfree(en);
+}
+
+
+/*
+ *	Returns hash value for IPVS LBLC entry
+ */
+static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ *	Hash an entry in the ip_vs_lblc_table.
+ *	returns bool success.
+ */
+static void
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+	unsigned hash = ip_vs_lblc_hashkey(en->addr);
+
+	list_add(&en->list, &tbl->bucket[hash]);
+	atomic_inc(&tbl->entries);
+}
+
+
+/*
+ *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read
+ *  lock
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
+{
+	unsigned hash = ip_vs_lblc_hashkey(addr);
+	struct ip_vs_lblc_entry *en;
+
+	list_for_each_entry(en, &tbl->bucket[hash], list)
+		if (en->addr == addr)
+			return en;
+
+	return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblc_entry *en;
+
+	en = ip_vs_lblc_get(tbl, daddr);
+	if (!en) {
+		en = kmalloc(sizeof(*en), GFP_ATOMIC);
+		if (!en) {
+			IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+			return NULL;
+		}
+
+		en->addr = daddr;
+		en->lastuse = jiffies;
+
+		atomic_inc(&dest->refcnt);
+		en->dest = dest;
+
+		ip_vs_lblc_hash(tbl, en);
+	} else if (en->dest != dest) {
+		atomic_dec(&en->dest->refcnt);
+		atomic_inc(&dest->refcnt);
+		en->dest = dest;
+	}
+
+	return en;
+}
+
+
+/*
+ *      Flush all the entries of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+{
+	struct ip_vs_lblc_entry *en, *nxt;
+	int i;
+
+	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+		}
+	}
+}
+
+
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct ip_vs_lblc_entry *en, *nxt;
+	unsigned long now = jiffies;
+	int i, j;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now,
+					en->lastuse + sysctl_ip_vs_lblc_expiration))
+				continue;
+
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&svc->sched_lock);
+	}
+	tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblc table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	int goal;
+	int i, j;
+	struct ip_vs_lblc_entry *en, *nxt;
+
+	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+		/* do full expiration check */
+		ip_vs_lblc_full_check(svc);
+		tbl->counter = 1;
+		goto out;
+	}
+
+	if (atomic_read(&tbl->entries) <= tbl->max_size) {
+		tbl->counter++;
+		goto out;
+	}
+
+	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+	if (goal > tbl->max_size/2)
+		goal = tbl->max_size/2;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+				continue;
+
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+			goal--;
+		}
+		write_unlock(&svc->sched_lock);
+		if (goal <= 0)
+			break;
+	}
+	tbl->rover = j;
+
+  out:
+	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblc_table *tbl;
+
+	/*
+	 *    Allocate the ip_vs_lblc_table for this service
+	 */
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+		  "current service\n", sizeof(*tbl));
+
+	/*
+	 *    Initialize the hash buckets
+	 */
+	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		INIT_LIST_HEAD(&tbl->bucket[i]);
+	}
+	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+
+	/*
+	 *    Hook periodic timer for garbage collection
+	 */
+	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+	return 0;
+}
+
+
+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+	/* remove periodic timer */
+	del_timer_sync(&tbl->periodic_timer);
+
+	/* got to clean up table entries here */
+	ip_vs_lblc_flush(tbl);
+
+	/* release the table itself */
+	kfree(tbl);
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+		  sizeof(*tbl));
+
+	return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	/*
+	 * We think the overhead of processing active connections is fifty
+	 * times higher than that of inactive connections in average. (This
+	 * fifty times might not be accurate, we will change it later.) We
+	 * use the following formula to estimate the overhead:
+	 *                dest->activeconns*50 + dest->inactconns
+	 * and the load:
+	 *                (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *                h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		if (atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ *   If this destination server is overloaded and there is a less loaded
+ *   server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+		struct ip_vs_dest *d;
+
+		list_for_each_entry(d, &svc->destinations, n_list) {
+			if (atomic_read(&d->activeconns)*2
+			    < atomic_read(&d->weight)) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+
+/*
+ *    Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblc_entry *en;
+
+	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
+
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
+	en = ip_vs_lblc_get(tbl, iph->daddr);
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/*
+		 * If the destination is not available, i.e. it's in the trash,
+		 * we must ignore it, as it may be removed from under our feet,
+		 * if someone drops our reference count. Our caller only makes
+		 * sure that destinations, that are not in the trash, are not
+		 * moved to the trash, while we are scheduling. But anyone can
+		 * free up entries from the trash at any time.
+		 */
+
+		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
+			dest = en->dest;
+	}
+	read_unlock(&svc->sched_lock);
+
+	/* If the destination has a weight and is not overloaded, use it */
+	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+		goto out;
+
+	/* No cache entry or it is invalid, time to schedule */
+	dest = __ip_vs_lblc_schedule(svc, iph);
+	if (!dest) {
+		IP_VS_DBG(1, "no destination available\n");
+		return NULL;
+	}
+
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblc_new(tbl, iph->daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
+	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->daddr),
+		  NIPQUAD(dest->addr.ip),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS LBLC Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
+{
+	.name =			"lblc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
+	.init_service =		ip_vs_lblc_init_svc,
+	.done_service =		ip_vs_lblc_done_svc,
+	.schedule =		ip_vs_lblc_schedule,
+};
+
+
+static int __init ip_vs_lblc_init(void)
+{
+	int ret;
+
+	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+	if (ret)
+		unregister_sysctl_table(sysctl_header);
+	return ret;
+}
+
+
+static void __exit ip_vs_lblc_cleanup(void)
+{
+	unregister_sysctl_table(sysctl_header);
+	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
new file mode 100644
index 00000000000..1f75ea83bcf
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,755 @@
+/*
+ * IPVS:        Locality-Based Least-Connection with Replication scheduler
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Julian Anastasov        :    Added the missing (dest->weight>0)
+ *                                  condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ *       if serverSet[dest_ip] is null then
+ *               n, serverSet[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- {least-conn (alive) node in serverSet[dest_ip]};
+ *               if (n is null) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                   n <- {weighted least-conn node};
+ *                   add n to serverSet[dest_ip];
+ *               if |serverSet[dest_ip]| > 1 AND
+ *                   now - serverSet[dest_ip].lastMod > T then
+ *                   m <- {most conn node in serverSet[dest_ip]};
+ *                   remove m from serverSet[dest_ip];
+ *       if serverSet[dest_ip] changed then
+ *               serverSet[dest_ip].lastMod <- now;
+ *
+ *       return n;
+ *
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblcr entries,
+ *    when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
+
+
+/*
+ *     for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
+#endif
+#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS destination set structure and operations
+ */
+struct ip_vs_dest_list {
+	struct ip_vs_dest_list  *next;          /* list link */
+	struct ip_vs_dest       *dest;          /* destination server */
+};
+
+struct ip_vs_dest_set {
+	atomic_t                size;           /* set size */
+	unsigned long           lastmod;        /* last modified time */
+	struct ip_vs_dest_list  *list;          /* destination list */
+	rwlock_t	        lock;           /* lock for this list */
+};
+
+
+static struct ip_vs_dest_list *
+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_list *e;
+
+	for (e=set->list; e!=NULL; e=e->next) {
+		if (e->dest == dest)
+			/* already existed */
+			return NULL;
+	}
+
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
+	if (e == NULL) {
+		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
+		return NULL;
+	}
+
+	atomic_inc(&dest->refcnt);
+	e->dest = dest;
+
+	/* link it to the list */
+	e->next = set->list;
+	set->list = e;
+	atomic_inc(&set->size);
+
+	set->lastmod = jiffies;
+	return e;
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_list *e, **ep;
+
+	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+		if (e->dest == dest) {
+			/* HIT */
+			*ep = e->next;
+			atomic_dec(&set->size);
+			set->lastmod = jiffies;
+			atomic_dec(&e->dest->refcnt);
+			kfree(e);
+			break;
+		}
+		ep = &e->next;
+	}
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+	struct ip_vs_dest_list *e, **ep;
+
+	write_lock(&set->lock);
+	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+		*ep = e->next;
+		/*
+		 * We don't kfree dest because it is refered either
+		 * by its service or by the trash dest list.
+		 */
+		atomic_dec(&e->dest->refcnt);
+		kfree(e);
+	}
+	write_unlock(&set->lock);
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+	register struct ip_vs_dest_list *e;
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	if (set == NULL)
+		return NULL;
+
+	/* select the first destination server, whose weight > 0 */
+	for (e=set->list; e!=NULL; e=e->next) {
+		least = e->dest;
+		if (least->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		if ((atomic_read(&least->weight) > 0)
+		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/* find the destination with the weighted least load */
+  nextstage:
+	for (e=e->next; e!=NULL; e=e->next) {
+		dest = e->dest;
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if ((loh * atomic_read(&dest->weight) >
+		     doh * atomic_read(&least->weight))
+		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+	return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+	register struct ip_vs_dest_list *e;
+	struct ip_vs_dest *dest, *most;
+	int moh, doh;
+
+	if (set == NULL)
+		return NULL;
+
+	/* select the first destination server, whose weight > 0 */
+	for (e=set->list; e!=NULL; e=e->next) {
+		most = e->dest;
+		if (atomic_read(&most->weight) > 0) {
+			moh = atomic_read(&most->activeconns) * 50
+				+ atomic_read(&most->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/* find the destination with the weighted most load */
+  nextstage:
+	for (e=e->next; e!=NULL; e=e->next) {
+		dest = e->dest;
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+		if ((moh * atomic_read(&dest->weight) <
+		     doh * atomic_read(&most->weight))
+		    && (atomic_read(&dest->weight) > 0)) {
+			most = dest;
+			moh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(most->addr.ip), ntohs(most->port),
+		  atomic_read(&most->activeconns),
+		  atomic_read(&most->refcnt),
+		  atomic_read(&most->weight), moh);
+	return most;
+}
+
+
+/*
+ *      IPVS lblcr entry represents an association between destination
+ *      IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+	struct list_head        list;
+	__be32                   addr;           /* destination IP address */
+	struct ip_vs_dest_set   set;            /* destination server set */
+	unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
+	atomic_t                entries;        /* number of entries */
+	int                     max_size;       /* maximum size of entries */
+	struct timer_list       periodic_timer; /* collect stale entries */
+	int                     rover;          /* rover for expire check */
+	int                     counter;        /* counter for no expire */
+};
+
+
+/*
+ *      IPVS LBLCR sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+	{
+		.procname	= "lblcr_expiration",
+		.data		= &sysctl_ip_vs_lblcr_expiration,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+	list_del(&en->list);
+	ip_vs_dest_set_eraseall(&en->set);
+	kfree(en);
+}
+
+
+/*
+ *	Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ *	Hash an entry in the ip_vs_lblcr_table.
+ *	returns bool success.
+ */
+static void
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+	unsigned hash = ip_vs_lblcr_hashkey(en->addr);
+
+	list_add(&en->list, &tbl->bucket[hash]);
+	atomic_inc(&tbl->entries);
+}
+
+
+/*
+ *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
+ *  read lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
+{
+	unsigned hash = ip_vs_lblcr_hashkey(addr);
+	struct ip_vs_lblcr_entry *en;
+
+	list_for_each_entry(en, &tbl->bucket[hash], list)
+		if (en->addr == addr)
+			return en;
+
+	return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
+		struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblcr_entry *en;
+
+	en = ip_vs_lblcr_get(tbl, daddr);
+	if (!en) {
+		en = kmalloc(sizeof(*en), GFP_ATOMIC);
+		if (!en) {
+			IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+			return NULL;
+		}
+
+		en->addr = daddr;
+		en->lastuse = jiffies;
+
+		/* initilize its dest set */
+		atomic_set(&(en->set.size), 0);
+		en->set.list = NULL;
+		rwlock_init(&en->set.lock);
+
+		ip_vs_lblcr_hash(tbl, en);
+	}
+
+	write_lock(&en->set.lock);
+	ip_vs_dest_set_insert(&en->set, dest);
+	write_unlock(&en->set.lock);
+
+	return en;
+}
+
+
+/*
+ *      Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+{
+	int i;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	/* No locking required, only called during cleanup. */
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+			ip_vs_lblcr_free(en);
+		}
+	}
+}
+
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	int i, j;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
+				       now))
+				continue;
+
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&svc->sched_lock);
+	}
+	tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblcr table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	int goal;
+	int i, j;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+		/* do full expiration check */
+		ip_vs_lblcr_full_check(svc);
+		tbl->counter = 1;
+		goto out;
+	}
+
+	if (atomic_read(&tbl->entries) <= tbl->max_size) {
+		tbl->counter++;
+		goto out;
+	}
+
+	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+	if (goal > tbl->max_size/2)
+		goal = tbl->max_size/2;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+				continue;
+
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+			goal--;
+		}
+		write_unlock(&svc->sched_lock);
+		if (goal <= 0)
+			break;
+	}
+	tbl->rover = j;
+
+  out:
+	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblcr_table *tbl;
+
+	/*
+	 *    Allocate the ip_vs_lblcr_table for this service
+	 */
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+		  "current service\n", sizeof(*tbl));
+
+	/*
+	 *    Initialize the hash buckets
+	 */
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		INIT_LIST_HEAD(&tbl->bucket[i]);
+	}
+	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+
+	/*
+	 *    Hook periodic timer for garbage collection
+	 */
+	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+	return 0;
+}
+
+
+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+	/* remove periodic timer */
+	del_timer_sync(&tbl->periodic_timer);
+
+	/* got to clean up table entries here */
+	ip_vs_lblcr_flush(tbl);
+
+	/* release the table itself */
+	kfree(tbl);
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+		  sizeof(*tbl));
+
+	return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	/*
+	 * We think the overhead of processing active connections is fifty
+	 * times higher than that of inactive connections in average. (This
+	 * fifty times might not be accurate, we will change it later.) We
+	 * use the following formula to estimate the overhead:
+	 *                dest->activeconns*50 + dest->inactconns
+	 * and the load:
+	 *                (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *                h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		if (atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr.ip), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ *   If this destination server is overloaded and there is a less loaded
+ *   server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+		struct ip_vs_dest *d;
+
+		list_for_each_entry(d, &svc->destinations, n_list) {
+			if (atomic_read(&d->activeconns)*2
+			    < atomic_read(&d->weight)) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+
+/*
+ *    Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblcr_entry *en;
+
+	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
+
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
+	en = ip_vs_lblcr_get(tbl, iph->daddr);
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/* Get the least loaded destination */
+		read_lock(&en->set.lock);
+		dest = ip_vs_dest_set_min(&en->set);
+		read_unlock(&en->set.lock);
+
+		/* More than one destination + enough time passed by, cleanup */
+		if (atomic_read(&en->set.size) > 1 &&
+				time_after(jiffies, en->set.lastmod +
+				sysctl_ip_vs_lblcr_expiration)) {
+			struct ip_vs_dest *m;
+
+			write_lock(&en->set.lock);
+			m = ip_vs_dest_set_max(&en->set);
+			if (m)
+				ip_vs_dest_set_erase(&en->set, m);
+			write_unlock(&en->set.lock);
+		}
+
+		/* If the destination is not overloaded, use it */
+		if (dest && !is_overloaded(dest, svc)) {
+			read_unlock(&svc->sched_lock);
+			goto out;
+		}
+
+		/* The cache entry is invalid, time to schedule */
+		dest = __ip_vs_lblcr_schedule(svc, iph);
+		if (!dest) {
+			IP_VS_DBG(1, "no destination available\n");
+			read_unlock(&svc->sched_lock);
+			return NULL;
+		}
+
+		/* Update our cache entry */
+		write_lock(&en->set.lock);
+		ip_vs_dest_set_insert(&en->set, dest);
+		write_unlock(&en->set.lock);
+	}
+	read_unlock(&svc->sched_lock);
+
+	if (dest)
+		goto out;
+
+	/* No cache entry, time to schedule */
+	dest = __ip_vs_lblcr_schedule(svc, iph);
+	if (!dest) {
+		IP_VS_DBG(1, "no destination available\n");
+		return NULL;
+	}
+
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblcr_new(tbl, iph->daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
+	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->daddr),
+		  NIPQUAD(dest->addr.ip),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+	.name =			"lblcr",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
+	.init_service =		ip_vs_lblcr_init_svc,
+	.done_service =		ip_vs_lblcr_done_svc,
+	.schedule =		ip_vs_lblcr_schedule,
+};
+
+
+static int __init ip_vs_lblcr_init(void)
+{
+	int ret;
+
+	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+	if (ret)
+		unregister_sysctl_table(sysctl_header);
+	return ret;
+}
+
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+	unregister_sysctl_table(sysctl_header);
+	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
new file mode 100644
index 00000000000..b69f808ac46
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -0,0 +1,103 @@
+/*
+ * IPVS:        Least-Connection Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     added the ip_vs_lc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We think the overhead of processing active connections is 256
+	 * times higher than that of inactive connections in average. (This
+	 * 256 times might not be accurate, we will change it later) We
+	 * use the following formula to estimate the overhead now:
+	 *		  dest->activeconns*256 + dest->inactconns
+	 */
+	return (atomic_read(&dest->activeconns) << 8) +
+		atomic_read(&dest->inactconns);
+}
+
+
+/*
+ *	Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least = NULL;
+	unsigned int loh = 0, doh;
+
+	IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
+
+	/*
+	 * Simply select the server with the least number of
+	 *        (activeconns<<5) + inactconns
+	 * Except whose weight is equal to zero.
+	 * If the weight is equal to zero, it means that the server is
+	 * quiesced, the existing connections to the server still get
+	 * served, but no new connection is assigned to the server.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+		    atomic_read(&dest->weight) == 0)
+			continue;
+		doh = ip_vs_lc_dest_overhead(dest);
+		if (!least || doh < loh) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	if (least)
+	IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->inactconns));
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+	.name =			"lc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
+	.schedule =		ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
new file mode 100644
index 00000000000..9a2d8033f08
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -0,0 +1,138 @@
+/*
+ * IPVS:        Never Queue scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimize its expected delay (The Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We only use the active connection number in the cost
+	 * calculation here.
+	 */
+	return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least = NULL;
+	unsigned int loh = 0, doh;
+
+	IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *	(server expected overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+		    !atomic_read(&dest->weight))
+			continue;
+
+		doh = ip_vs_nq_dest_overhead(dest);
+
+		/* return the server directly if it is idle */
+		if (atomic_read(&dest->activeconns) == 0) {
+			least = dest;
+			loh = doh;
+			goto out;
+		}
+
+		if (!least ||
+		    (loh * atomic_read(&dest->weight) >
+		     doh * atomic_read(&least->weight))) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	if (!least)
+		return NULL;
+
+  out:
+	IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+	.name =			"nq",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
+	.schedule =		ip_vs_nq_schedule,
+};
+
+
+static int __init ip_vs_nq_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
new file mode 100644
index 00000000000..0791f9e08fe
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -0,0 +1,288 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ *	register an ipvs protocol
+ */
+static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+	pp->next = ip_vs_proto_table[hash];
+	ip_vs_proto_table[hash] = pp;
+
+	if (pp->init != NULL)
+		pp->init(pp);
+
+	return 0;
+}
+
+
+/*
+ *	unregister an ipvs protocol
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	struct ip_vs_protocol **pp_p;
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+	pp_p = &ip_vs_proto_table[hash];
+	for (; *pp_p; pp_p = &(*pp_p)->next) {
+		if (*pp_p == pp) {
+			*pp_p = pp->next;
+			if (pp->exit != NULL)
+				pp->exit(pp);
+			return 0;
+		}
+	}
+
+	return -ESRCH;
+}
+
+
+/*
+ *	get ip_vs_protocol object by its proto.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+	struct ip_vs_protocol *pp;
+	unsigned hash = IP_VS_PROTO_HASH(proto);
+
+	for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
+		if (pp->protocol == proto)
+			return pp;
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Propagate event for state change to all protocols
+ */
+void ip_vs_protocol_timeout_change(int flags)
+{
+	struct ip_vs_protocol *pp;
+	int i;
+
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
+			if (pp->timeout_change)
+				pp->timeout_change(pp, flags);
+		}
+	}
+}
+
+
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+	return kmemdup(table, size, GFP_ATOMIC);
+}
+
+
+/*
+ *	Set timeout value for state specified by name
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
+{
+	int i;
+
+	if (!table || !name || !to)
+		return -EINVAL;
+
+	for (i = 0; i < num; i++) {
+		if (strcmp(names[i], name))
+			continue;
+		table[i] = to * HZ;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+	if (pp == NULL || pp->state_name == NULL)
+		return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
+	return pp->state_name(state);
+}
+
+
+static void
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+			     const struct sk_buff *skb,
+			     int offset,
+			     const char *msg)
+{
+	char buf[128];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else if (ih->frag_off & htons(IP_OFFSET))
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+	else {
+		__be16 _ports[2], *pptr
+;
+		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
+				pp->name,
+				NIPQUAD(ih->saddr),
+				NIPQUAD(ih->daddr));
+		else
+			sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
+				pp->name,
+				NIPQUAD(ih->saddr),
+				ntohs(pptr[0]),
+				NIPQUAD(ih->daddr),
+				ntohs(pptr[1]));
+	}
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+			     const struct sk_buff *skb,
+			     int offset,
+			     const char *msg)
+{
+	char buf[192];
+	struct ipv6hdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else if (ih->nexthdr == IPPROTO_FRAGMENT)
+		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
+			pp->name, NIP6(ih->saddr),
+			NIP6(ih->daddr));
+	else {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
+				pp->name,
+				NIP6(ih->saddr),
+				NIP6(ih->daddr));
+		else
+			sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
+				pp->name,
+				NIP6(ih->saddr),
+				ntohs(pptr[0]),
+				NIP6(ih->daddr),
+				ntohs(pptr[1]));
+	}
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+#endif
+
+
+void
+ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+			  const struct sk_buff *skb,
+			  int offset,
+			  const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (skb->protocol == htons(ETH_P_IPV6))
+		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+	else
+#endif
+		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+
+int __init ip_vs_protocol_init(void)
+{
+	char protocols[64];
+#define REGISTER_PROTOCOL(p)			\
+	do {					\
+		register_ip_vs_protocol(p);	\
+		strcat(protocols, ", ");	\
+		strcat(protocols, (p)->name);	\
+	} while (0)
+
+	protocols[0] = '\0';
+	protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+	IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
+
+	return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+	struct ip_vs_protocol *pp;
+	int i;
+
+	/* unregister all the ipvs protocols */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		while ((pp = ip_vs_proto_table[i]) != NULL)
+			unregister_ip_vs_protocol(pp);
+	}
+}
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 00000000000..80ab0c8e5b4
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,235 @@
+/*
+ * ip_vs_proto_ah_esp.c:	AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
+ *		Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+	__u8		icookie[8];
+	__u8		rcookie[8];
+	__u8		np;
+	__u8		version;
+	__u8		xchgtype;
+	__u8		flags;
+	__u32		msgid;
+	__u32		length;
+};
+
+*/
+
+#define PORT_ISAKMP	500
+
+
+static struct ip_vs_conn *
+ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		   const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		   int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
+				       &iph->saddr,
+				       htons(PORT_ISAKMP),
+				       &iph->daddr,
+				       htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
+				       &iph->daddr,
+				       htons(PORT_ISAKMP),
+				       &iph->saddr,
+				       htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		/*
+		 * We are not sure if the packet is from our
+		 * service, so our conn_schedule hook should return NF_ACCEPT
+		 */
+		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+			      "%s%s %s->%s\n",
+			      inverse ? "ICMP+" : "",
+			      pp->name,
+			      IP_VS_DBG_ADDR(af, &iph->saddr),
+			      IP_VS_DBG_ADDR(af, &iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+		    struct ip_vs_protocol *pp,
+		    const struct ip_vs_iphdr *iph,
+		    unsigned int proto_off,
+		    int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
+					&iph->saddr,
+					htons(PORT_ISAKMP),
+					&iph->daddr,
+					htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
+					&iph->daddr,
+					htons(PORT_ISAKMP),
+					&iph->saddr,
+					htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+			      "%s%s %s->%s\n",
+			      inverse ? "ICMP+" : "",
+			      pp->name,
+			      IP_VS_DBG_ADDR(af, &iph->saddr),
+			      IP_VS_DBG_ADDR(af, &iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static int
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+		     int *verdict, struct ip_vs_conn **cpp)
+{
+	/*
+	 * AH/ESP is only related traffic. Pass the packet to IP stack.
+	 */
+	*verdict = NF_ACCEPT;
+	return 0;
+}
+
+
+static void
+ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		       int offset, const char *msg)
+{
+	char buf[256];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static void
+ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		       int offset, const char *msg)
+{
+	char buf[256];
+	struct ipv6hdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else
+		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
+			pp->name, NIP6(ih->saddr),
+			NIP6(ih->daddr));
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+#endif
+
+static void
+ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		    int offset, const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (skb->protocol == htons(ETH_P_IPV6))
+		ah_esp_debug_packet_v6(pp, skb, offset, msg);
+	else
+#endif
+		ah_esp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+
+static void ah_esp_init(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+static void ah_esp_exit(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+struct ip_vs_protocol ip_vs_protocol_ah = {
+	.name =			"AH",
+	.protocol =		IPPROTO_AH,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			ah_esp_init,
+	.exit =			ah_esp_exit,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ah_esp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+	.set_state_timeout =	NULL,
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+struct ip_vs_protocol ip_vs_protocol_esp = {
+	.name =			"ESP",
+	.protocol =		IPPROTO_ESP,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			ah_esp_init,
+	.exit =			ah_esp_exit,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ah_esp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+};
+#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 00000000000..dd4566ea2bf
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,732 @@
+/*
+ * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		int inverse)
+{
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		return ip_vs_conn_in_get(af, iph->protocol,
+					 &iph->saddr, pptr[0],
+					 &iph->daddr, pptr[1]);
+	} else {
+		return ip_vs_conn_in_get(af, iph->protocol,
+					 &iph->daddr, pptr[1],
+					 &iph->saddr, pptr[0]);
+	}
+}
+
+static struct ip_vs_conn *
+tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		 int inverse)
+{
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		return ip_vs_conn_out_get(af, iph->protocol,
+					  &iph->saddr, pptr[0],
+					  &iph->daddr, pptr[1]);
+	} else {
+		return ip_vs_conn_out_get(af, iph->protocol,
+					  &iph->daddr, pptr[1],
+					  &iph->saddr, pptr[0]);
+	}
+}
+
+
+static int
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	struct ip_vs_service *svc;
+	struct tcphdr _tcph, *th;
+	struct ip_vs_iphdr iph;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	if (th->syn &&
+	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
+				     th->dest))) {
+		if (ip_vs_todrop()) {
+			/*
+			 * It seems that we are very loaded.
+			 * We have to drop this packet :(
+			 */
+			ip_vs_service_put(svc);
+			*verdict = NF_DROP;
+			return 0;
+		}
+
+		/*
+		 * Let the virtual server select a real server for the
+		 * incoming connection, and create a connection entry.
+		 */
+		*cpp = ip_vs_schedule(svc, skb);
+		if (!*cpp) {
+			*verdict = ip_vs_leave(svc, skb, pp);
+			return 0;
+		}
+		ip_vs_service_put(svc);
+	}
+	return 1;
+}
+
+
+static inline void
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcph->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(tcph->check))));
+	else
+#endif
+	tcph->check =
+		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+				 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(tcph->check))));
+}
+
+
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcph->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(tcph->check))));
+	else
+#endif
+	tcph->check =
+		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+				ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(tcph->check))));
+}
+
+
+static int
+tcp_snat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct tcphdr *tcph;
+	unsigned int tcphoff;
+	int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
+	oldlen = skb->len - tcphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/* Call application helper if needed */
+		if (!ip_vs_app_pkt_out(cp, skb))
+			return 0;
+	}
+
+	tcph = (void *)skb_network_header(skb) + tcphoff;
+	tcph->source = cp->vport;
+
+	/* Adjust TCP checksums */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - tcphoff));
+	} else if (!cp->app) {
+		/* Only port and addr are changed, do fast csum update */
+		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+				     cp->dport, cp->vport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		tcph->check = 0;
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+						      &cp->caddr.in6,
+						      skb->len - tcphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+							cp->caddr.ip,
+							skb->len - tcphoff,
+							cp->protocol,
+							skb->csum);
+
+		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+			  pp->name, tcph->check,
+			  (char*)&(tcph->check) - (char*)tcph);
+	}
+	return 1;
+}
+
+
+static int
+tcp_dnat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct tcphdr *tcph;
+	unsigned int tcphoff;
+	int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
+	oldlen = skb->len - tcphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/*
+		 *	Attempt ip_vs_app call.
+		 *	It will fix ip_vs_conn and iph ack_seq stuff
+		 */
+		if (!ip_vs_app_pkt_in(cp, skb))
+			return 0;
+	}
+
+	tcph = (void *)skb_network_header(skb) + tcphoff;
+	tcph->dest = cp->dport;
+
+	/*
+	 *	Adjust TCP checksums
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - tcphoff));
+	} else if (!cp->app) {
+		/* Only port and addr are changed, do fast csum update */
+		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+				     cp->vport, cp->dport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		tcph->check = 0;
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+						      &cp->daddr.in6,
+						      skb->len - tcphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+							cp->daddr.ip,
+							skb->len - tcphoff,
+							cp->protocol,
+							skb->csum);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return 1;
+}
+
+
+static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_NONE:
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+	case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+		if (af == AF_INET6) {
+			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					    &ipv6_hdr(skb)->daddr,
+					    skb->len - tcphoff,
+					    ipv6_hdr(skb)->nexthdr,
+					    skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
+		} else
+#endif
+			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					      ip_hdr(skb)->daddr,
+					      skb->len - tcphoff,
+					      ip_hdr(skb)->protocol,
+					      skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
+		break;
+	default:
+		/* No need to checksum. */
+		break;
+	}
+
+	return 1;
+}
+
+
+#define TCP_DIR_INPUT		0
+#define TCP_DIR_OUTPUT		4
+#define TCP_DIR_INPUT_ONLY	8
+
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
+	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
+	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
+	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ *	Timeout table[state]
+ */
+static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	2*HZ,
+	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
+	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
+	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
+	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
+	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
+	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
+	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
+	[IP_VS_TCP_S_LAST]		=	2*HZ,
+};
+
+static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	"NONE",
+	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
+	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
+	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
+	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
+	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
+	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
+	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
+	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
+	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
+	[IP_VS_TCP_S_LAST]		=	"BUG!",
+};
+
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+struct tcp_states_t {
+	int next_state[IP_VS_TCP_S_LAST];
+};
+
+static const char * tcp_state_name(int state)
+{
+	if (state >= IP_VS_TCP_S_LAST)
+		return "ERR!";
+	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
+static struct tcp_states_t tcp_states [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t tcp_states_dos [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t *tcp_state_table = tcp_states;
+
+
+static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+{
+	int on = (flags & 1);		/* secure_tcp */
+
+	/*
+	** FIXME: change secure_tcp to independent sysctl var
+	** or make it per-service or per-app because it is valid
+	** for most if not for all of the applications. Something
+	** like "capabilities" (flags) for each object.
+	*/
+	tcp_state_table = (on? tcp_states_dos : tcp_states);
+}
+
+static int
+tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
+				       tcp_state_name_table, sname, to);
+}
+
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+	if (th->rst)
+		return 3;
+	if (th->syn)
+		return 0;
+	if (th->fin)
+		return 1;
+	if (th->ack)
+		return 2;
+	return -1;
+}
+
+static inline void
+set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+	      int direction, struct tcphdr *th)
+{
+	int state_idx;
+	int new_state = IP_VS_TCP_S_CLOSE;
+	int state_off = tcp_state_off[direction];
+
+	/*
+	 *    Update state offset to INPUT_ONLY if necessary
+	 *    or delete NO_OUTPUT flag if output packet detected
+	 */
+	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+		if (state_off == TCP_DIR_OUTPUT)
+			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+		else
+			state_off = TCP_DIR_INPUT_ONLY;
+	}
+
+	if ((state_idx = tcp_state_idx(th)) < 0) {
+		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+		goto tcp_state_out;
+	}
+
+	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+  tcp_state_out:
+	if (new_state != cp->state) {
+		struct ip_vs_dest *dest = cp->dest;
+
+		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+			      "%s:%d state: %s->%s conn->refcnt:%d\n",
+			      pp->name,
+			      ((state_off == TCP_DIR_OUTPUT) ?
+			       "output " : "input "),
+			      th->syn ? 'S' : '.',
+			      th->fin ? 'F' : '.',
+			      th->ack ? 'A' : '.',
+			      th->rst ? 'R' : '.',
+			      IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+			      ntohs(cp->dport),
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+			      ntohs(cp->cport),
+			      tcp_state_name(cp->state),
+			      tcp_state_name(new_state),
+			      atomic_read(&cp->refcnt));
+
+		if (dest) {
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+	}
+
+	cp->timeout = pp->timeout_table[cp->state = new_state];
+}
+
+
+/*
+ *	Handle state transitions
+ */
+static int
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_protocol *pp)
+{
+	struct tcphdr _tcph, *th;
+
+#ifdef CONFIG_IP_VS_IPV6
+	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+	int ihl = ip_hdrlen(skb);
+#endif
+
+	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+
+	spin_lock(&cp->lock);
+	set_tcp_state(pp, cp, direction, th);
+	spin_unlock(&cp->lock);
+
+	return 1;
+}
+
+
+/*
+ *	Hash table for TCP application incarnations
+ */
+#define	TCP_APP_TAB_BITS	4
+#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
+#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
+
+static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(tcp_app_lock);
+
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
+		& TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct ip_vs_app *inc)
+{
+	struct ip_vs_app *i;
+	__u16 hash;
+	__be16 port = inc->port;
+	int ret = 0;
+
+	hash = tcp_app_hashkey(port);
+
+	spin_lock_bh(&tcp_app_lock);
+	list_for_each_entry(i, &tcp_apps[hash], p_list) {
+		if (i->port == port) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+	list_add(&inc->p_list, &tcp_apps[hash]);
+	atomic_inc(&ip_vs_protocol_tcp.appcnt);
+
+  out:
+	spin_unlock_bh(&tcp_app_lock);
+	return ret;
+}
+
+
+static void
+tcp_unregister_app(struct ip_vs_app *inc)
+{
+	spin_lock_bh(&tcp_app_lock);
+	atomic_dec(&ip_vs_protocol_tcp.appcnt);
+	list_del(&inc->p_list);
+	spin_unlock_bh(&tcp_app_lock);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	int hash;
+	struct ip_vs_app *inc;
+	int result = 0;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = tcp_app_hashkey(cp->vport);
+
+	spin_lock(&tcp_app_lock);
+	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+		if (inc->port == cp->vport) {
+			if (unlikely(!ip_vs_app_inc_get(inc)))
+				break;
+			spin_unlock(&tcp_app_lock);
+
+			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+				      "%s:%u to app %s on port %u\n",
+				      __func__,
+				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				      ntohs(cp->cport),
+				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+				      ntohs(cp->vport),
+				      inc->name, ntohs(inc->port));
+
+			cp->app = inc;
+			if (inc->init_conn)
+				result = inc->init_conn(inc, cp);
+			goto out;
+		}
+	}
+	spin_unlock(&tcp_app_lock);
+
+  out:
+	return result;
+}
+
+
+/*
+ *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+{
+	spin_lock(&cp->lock);
+	cp->state = IP_VS_TCP_S_LISTEN;
+	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+	spin_unlock(&cp->lock);
+}
+
+
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+{
+	IP_VS_INIT_HASH_TABLE(tcp_apps);
+	pp->timeout_table = tcp_timeouts;
+}
+
+
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+	.name =			"TCP",
+	.protocol =		IPPROTO_TCP,
+	.num_states =		IP_VS_TCP_S_LAST,
+	.dont_defrag =		0,
+	.appcnt =		ATOMIC_INIT(0),
+	.init =			ip_vs_tcp_init,
+	.exit =			ip_vs_tcp_exit,
+	.register_app =		tcp_register_app,
+	.unregister_app =	tcp_unregister_app,
+	.conn_schedule =	tcp_conn_schedule,
+	.conn_in_get =		tcp_conn_in_get,
+	.conn_out_get =		tcp_conn_out_get,
+	.snat_handler =		tcp_snat_handler,
+	.dnat_handler =		tcp_dnat_handler,
+	.csum_check =		tcp_csum_check,
+	.state_name =		tcp_state_name,
+	.state_transition =	tcp_state_transition,
+	.app_conn_bind =	tcp_app_conn_bind,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	tcp_timeout_change,
+	.set_state_timeout =	tcp_set_state_timeout,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 00000000000..6eb6039d634
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,533 @@
+/*
+ * ip_vs_proto_udp.c:	UDP load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
+
+#include <net/ip_vs.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+
+static struct ip_vs_conn *
+udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		int inverse)
+{
+	struct ip_vs_conn *cp;
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(af, iph->protocol,
+				       &iph->saddr, pptr[0],
+				       &iph->daddr, pptr[1]);
+	} else {
+		cp = ip_vs_conn_in_get(af, iph->protocol,
+				       &iph->daddr, pptr[1],
+				       &iph->saddr, pptr[0]);
+	}
+
+	return cp;
+}
+
+
+static struct ip_vs_conn *
+udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		 int inverse)
+{
+	struct ip_vs_conn *cp;
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(af, iph->protocol,
+					&iph->saddr, pptr[0],
+					&iph->daddr, pptr[1]);
+	} else {
+		cp = ip_vs_conn_out_get(af, iph->protocol,
+					&iph->daddr, pptr[1],
+					&iph->saddr, pptr[0]);
+	}
+
+	return cp;
+}
+
+
+static int
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	struct ip_vs_service *svc;
+	struct udphdr _udph, *uh;
+	struct ip_vs_iphdr iph;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+	if (uh == NULL) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+				&iph.daddr, uh->dest);
+	if (svc) {
+		if (ip_vs_todrop()) {
+			/*
+			 * It seems that we are very loaded.
+			 * We have to drop this packet :(
+			 */
+			ip_vs_service_put(svc);
+			*verdict = NF_DROP;
+			return 0;
+		}
+
+		/*
+		 * Let the virtual server select a real server for the
+		 * incoming connection, and create a connection entry.
+		 */
+		*cpp = ip_vs_schedule(svc, skb);
+		if (!*cpp) {
+			*verdict = ip_vs_leave(svc, skb, pp);
+			return 0;
+		}
+		ip_vs_service_put(svc);
+	}
+	return 1;
+}
+
+
+static inline void
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		uhdr->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(uhdr->check))));
+	else
+#endif
+		uhdr->check =
+			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(uhdr->check))));
+	if (!uhdr->check)
+		uhdr->check = CSUM_MANGLED_0;
+}
+
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		uhdr->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(uhdr->check))));
+	else
+#endif
+	uhdr->check =
+		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+				ip_vs_check_diff2(oldlen, newlen,
+						~csum_unfold(uhdr->check))));
+}
+
+
+static int
+udp_snat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct udphdr *udph;
+	unsigned int udphoff;
+	int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
+	oldlen = skb->len - udphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/*
+		 *	Call application helper if needed
+		 */
+		if (!ip_vs_app_pkt_out(cp, skb))
+			return 0;
+	}
+
+	udph = (void *)skb_network_header(skb) + udphoff;
+	udph->source = cp->vport;
+
+	/*
+	 *	Adjust UDP checksums
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - udphoff));
+	} else if (!cp->app && (udph->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+				     cp->dport, cp->vport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		udph->check = 0;
+		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+						      &cp->caddr.in6,
+						      skb->len - udphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+							cp->caddr.ip,
+							skb->len - udphoff,
+							cp->protocol,
+							skb->csum);
+		if (udph->check == 0)
+			udph->check = CSUM_MANGLED_0;
+		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+			  pp->name, udph->check,
+			  (char*)&(udph->check) - (char*)udph);
+	}
+	return 1;
+}
+
+
+static int
+udp_dnat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct udphdr *udph;
+	unsigned int udphoff;
+	int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
+	oldlen = skb->len - udphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/*
+		 *	Attempt ip_vs_app call.
+		 *	It will fix ip_vs_conn
+		 */
+		if (!ip_vs_app_pkt_in(cp, skb))
+			return 0;
+	}
+
+	udph = (void *)skb_network_header(skb) + udphoff;
+	udph->dest = cp->dport;
+
+	/*
+	 *	Adjust UDP checksums
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+					htonl(oldlen),
+					htonl(skb->len - udphoff));
+	} else if (!cp->app && (udph->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+				     cp->vport, cp->dport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		udph->check = 0;
+		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			udph->check = csum_ipv6_magic(&cp->caddr.in6,
+						      &cp->daddr.in6,
+						      skb->len - udphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			udph->check = csum_tcpudp_magic(cp->caddr.ip,
+							cp->daddr.ip,
+							skb->len - udphoff,
+							cp->protocol,
+							skb->csum);
+		if (udph->check == 0)
+			udph->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return 1;
+}
+
+
+static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	struct udphdr _udph, *uh;
+	unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
+
+	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+	if (uh == NULL)
+		return 0;
+
+	if (uh->check != 0) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_NONE:
+			skb->csum = skb_checksum(skb, udphoff,
+						 skb->len - udphoff, 0);
+		case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+			if (af == AF_INET6) {
+				if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						    &ipv6_hdr(skb)->daddr,
+						    skb->len - udphoff,
+						    ipv6_hdr(skb)->nexthdr,
+						    skb->csum)) {
+					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+							 "Failed checksum for");
+					return 0;
+				}
+			} else
+#endif
+				if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+						      ip_hdr(skb)->daddr,
+						      skb->len - udphoff,
+						      ip_hdr(skb)->protocol,
+						      skb->csum)) {
+					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+							 "Failed checksum for");
+					return 0;
+				}
+			break;
+		default:
+			/* No need to checksum. */
+			break;
+		}
+	}
+	return 1;
+}
+
+
+/*
+ *	Note: the caller guarantees that only one of register_app,
+ *	unregister_app or app_conn_bind is called each time.
+ */
+
+#define	UDP_APP_TAB_BITS	4
+#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
+#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)
+
+static struct list_head udp_apps[UDP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(udp_app_lock);
+
+static inline __u16 udp_app_hashkey(__be16 port)
+{
+	return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
+		& UDP_APP_TAB_MASK;
+}
+
+
+static int udp_register_app(struct ip_vs_app *inc)
+{
+	struct ip_vs_app *i;
+	__u16 hash;
+	__be16 port = inc->port;
+	int ret = 0;
+
+	hash = udp_app_hashkey(port);
+
+
+	spin_lock_bh(&udp_app_lock);
+	list_for_each_entry(i, &udp_apps[hash], p_list) {
+		if (i->port == port) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+	list_add(&inc->p_list, &udp_apps[hash]);
+	atomic_inc(&ip_vs_protocol_udp.appcnt);
+
+  out:
+	spin_unlock_bh(&udp_app_lock);
+	return ret;
+}
+
+
+static void
+udp_unregister_app(struct ip_vs_app *inc)
+{
+	spin_lock_bh(&udp_app_lock);
+	atomic_dec(&ip_vs_protocol_udp.appcnt);
+	list_del(&inc->p_list);
+	spin_unlock_bh(&udp_app_lock);
+}
+
+
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	int hash;
+	struct ip_vs_app *inc;
+	int result = 0;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = udp_app_hashkey(cp->vport);
+
+	spin_lock(&udp_app_lock);
+	list_for_each_entry(inc, &udp_apps[hash], p_list) {
+		if (inc->port == cp->vport) {
+			if (unlikely(!ip_vs_app_inc_get(inc)))
+				break;
+			spin_unlock(&udp_app_lock);
+
+			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+				      "%s:%u to app %s on port %u\n",
+				      __func__,
+				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				      ntohs(cp->cport),
+				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+				      ntohs(cp->vport),
+				      inc->name, ntohs(inc->port));
+
+			cp->app = inc;
+			if (inc->init_conn)
+				result = inc->init_conn(inc, cp);
+			goto out;
+		}
+	}
+	spin_unlock(&udp_app_lock);
+
+  out:
+	return result;
+}
+
+
+static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
+	[IP_VS_UDP_S_LAST]		=	2*HZ,
+};
+
+static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+	[IP_VS_UDP_S_NORMAL]		=	"UDP",
+	[IP_VS_UDP_S_LAST]		=	"BUG!",
+};
+
+
+static int
+udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
+				       udp_state_name_table, sname, to);
+}
+
+static const char * udp_state_name(int state)
+{
+	if (state >= IP_VS_UDP_S_LAST)
+		return "ERR!";
+	return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static int
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_protocol *pp)
+{
+	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+	return 1;
+}
+
+static void udp_init(struct ip_vs_protocol *pp)
+{
+	IP_VS_INIT_HASH_TABLE(udp_apps);
+	pp->timeout_table = udp_timeouts;
+}
+
+static void udp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+	.name =			"UDP",
+	.protocol =		IPPROTO_UDP,
+	.num_states =		IP_VS_UDP_S_LAST,
+	.dont_defrag =		0,
+	.init =			udp_init,
+	.exit =			udp_exit,
+	.conn_schedule =	udp_conn_schedule,
+	.conn_in_get =		udp_conn_in_get,
+	.conn_out_get =		udp_conn_out_get,
+	.snat_handler =		udp_snat_handler,
+	.dnat_handler =		udp_dnat_handler,
+	.csum_check =		udp_csum_check,
+	.state_transition =	udp_state_transition,
+	.state_name =		udp_state_name,
+	.register_app =		udp_register_app,
+	.unregister_app =	udp_unregister_app,
+	.app_conn_bind =	udp_app_conn_bind,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	NULL,
+	.set_state_timeout =	udp_set_state_timeout,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
new file mode 100644
index 00000000000..a22195f68ac
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,112 @@
+/*
+ * IPVS:        Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ *     Wensong Zhang            :     changed the ip_vs_rr_schedule to return dest
+ *     Julian Anastasov         :     fixed the NULL pointer access bug in debugging
+ *     Wensong Zhang            :     changed some comestics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_rr_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+
+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct list_head *p, *q;
+	struct ip_vs_dest *dest;
+
+	IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
+
+	write_lock(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	p = p->next;
+	q = p;
+	do {
+		/* skip list head */
+		if (q == &svc->destinations) {
+			q = q->next;
+			continue;
+		}
+
+		dest = list_entry(q, struct ip_vs_dest, n_list);
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0)
+			/* HIT */
+			goto out;
+		q = q->next;
+	} while (q != p);
+	write_unlock(&svc->sched_lock);
+	return NULL;
+
+  out:
+	svc->sched_data = q;
+	write_unlock(&svc->sched_lock);
+	IP_VS_DBG_BUF(6, "RR: server %s:%u "
+		      "activeconns %d refcnt %d weight %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->activeconns),
+		      atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+	return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+	.name =			"rr",			/* name */
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
+	.init_service =		ip_vs_rr_init_svc,
+	.update_service =	ip_vs_rr_update_svc,
+	.schedule =		ip_vs_rr_schedule,
+};
+
+static int __init ip_vs_rr_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
new file mode 100644
index 00000000000..a46ad9e3501
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *  IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_sched_lock);
+
+
+/*
+ *  Bind a service with a scheduler
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+			 struct ip_vs_scheduler *scheduler)
+{
+	int ret;
+
+	if (svc == NULL) {
+		IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
+		return -EINVAL;
+	}
+	if (scheduler == NULL) {
+		IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
+		return -EINVAL;
+	}
+
+	svc->scheduler = scheduler;
+
+	if (scheduler->init_service) {
+		ret = scheduler->init_service(svc);
+		if (ret) {
+			IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ *  Unbind a service with its scheduler
+ */
+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+{
+	struct ip_vs_scheduler *sched;
+
+	if (svc == NULL) {
+		IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
+		return -EINVAL;
+	}
+
+	sched = svc->scheduler;
+	if (sched == NULL) {
+		IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
+		return -EINVAL;
+	}
+
+	if (sched->done_service) {
+		if (sched->done_service(svc) != 0) {
+			IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
+			return -EINVAL;
+		}
+	}
+
+	svc->scheduler = NULL;
+	return 0;
+}
+
+
+/*
+ *  Get scheduler in the scheduler list by name
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched;
+
+	IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
+		  sched_name);
+
+	read_lock_bh(&__ip_vs_sched_lock);
+
+	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+		/*
+		 * Test and get the modules atomically
+		 */
+		if (sched->module && !try_module_get(sched->module)) {
+			/*
+			 * This scheduler is just deleted
+			 */
+			continue;
+		}
+		if (strcmp(sched_name, sched->name)==0) {
+			/* HIT */
+			read_unlock_bh(&__ip_vs_sched_lock);
+			return sched;
+		}
+		if (sched->module)
+			module_put(sched->module);
+	}
+
+	read_unlock_bh(&__ip_vs_sched_lock);
+	return NULL;
+}
+
+
+/*
+ *  Lookup scheduler and try to load it if it doesn't exist
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched;
+
+	/*
+	 *  Search for the scheduler by sched_name
+	 */
+	sched = ip_vs_sched_getbyname(sched_name);
+
+	/*
+	 *  If scheduler not found, load the module and search again
+	 */
+	if (sched == NULL) {
+		request_module("ip_vs_%s", sched_name);
+		sched = ip_vs_sched_getbyname(sched_name);
+	}
+
+	return sched;
+}
+
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+	if (scheduler->module)
+		module_put(scheduler->module);
+}
+
+
+/*
+ *  Register a scheduler in the scheduler list
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	struct ip_vs_scheduler *sched;
+
+	if (!scheduler) {
+		IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
+		return -EINVAL;
+	}
+
+	if (!scheduler->name) {
+		IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
+		return -EINVAL;
+	}
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	write_lock_bh(&__ip_vs_sched_lock);
+
+	if (!list_empty(&scheduler->n_list)) {
+		write_unlock_bh(&__ip_vs_sched_lock);
+		ip_vs_use_count_dec();
+		IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+			  "already linked\n", scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *  Make sure that the scheduler with this name doesn't exist
+	 *  in the scheduler list.
+	 */
+	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+		if (strcmp(scheduler->name, sched->name) == 0) {
+			write_unlock_bh(&__ip_vs_sched_lock);
+			ip_vs_use_count_dec();
+			IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+					"already existed in the system\n",
+					scheduler->name);
+			return -EINVAL;
+		}
+	}
+	/*
+	 *	Add it into the d-linked scheduler list
+	 */
+	list_add(&scheduler->n_list, &ip_vs_schedulers);
+	write_unlock_bh(&__ip_vs_sched_lock);
+
+	IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
+
+	return 0;
+}
+
+
+/*
+ *  Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	if (!scheduler) {
+		IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
+		return -EINVAL;
+	}
+
+	write_lock_bh(&__ip_vs_sched_lock);
+	if (list_empty(&scheduler->n_list)) {
+		write_unlock_bh(&__ip_vs_sched_lock);
+		IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
+			  "is not in the list. failed\n", scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Remove it from the d-linked scheduler list
+	 */
+	list_del(&scheduler->n_list);
+	write_unlock_bh(&__ip_vs_sched_lock);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
+
+	return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
new file mode 100644
index 00000000000..7d2f22f04b8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,140 @@
+/*
+ * IPVS:        Shortest Expected Delay scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We only use the active connection number in the cost
+	 * calculation here.
+	 */
+	return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *	(server expected overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_sed_dest_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_sed_dest_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "SED: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+	.name =			"sed",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
+	.schedule =		ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
new file mode 100644
index 00000000000..1d96de27fef
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,258 @@
+/*
+ * IPVS:        Source Hashing scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[src_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) or (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS        8
+#endif
+#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
+
+
+/*
+ *	Returns hash value for IPVS SH entry
+ */
+static inline unsigned ip_vs_sh_hashkey(__be32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
+{
+	return (tbl[ip_vs_sh_hashkey(addr)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_sh_bucket *b;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+
+	b = tbl;
+	p = &svc->destinations;
+	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+		if (list_empty(p)) {
+			b->dest = NULL;
+		} else {
+			if (p == &svc->destinations)
+				p = p->next;
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			atomic_inc(&dest->refcnt);
+			b->dest = dest;
+
+			p = p->next;
+		}
+		b++;
+	}
+	return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+{
+	int i;
+	struct ip_vs_sh_bucket *b;
+
+	b = tbl;
+	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+		if (b->dest) {
+			atomic_dec(&b->dest->refcnt);
+			b->dest = NULL;
+		}
+		b++;
+	}
+}
+
+
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl;
+
+	/* allocate the SH table for this service */
+	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
+		      GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_sh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_sh_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+	return 0;
+}
+
+
+static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_sh_flush(tbl);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_sh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_sh_bucket *tbl;
+	struct iphdr *iph = ip_hdr(skb);
+
+	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
+	dest = ip_vs_sh_get(tbl, iph->saddr);
+	if (!dest
+	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+	    || atomic_read(&dest->weight) <= 0
+	    || is_overloaded(dest)) {
+		return NULL;
+	}
+
+	IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->saddr),
+		  NIPQUAD(dest->addr.ip),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS SH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+	.name =			"sh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list	 =		LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	0,
+#endif
+	.init_service =		ip_vs_sh_init_svc,
+	.done_service =		ip_vs_sh_done_svc,
+	.update_service =	ip_vs_sh_update_svc,
+	.schedule =		ip_vs_sh_schedule,
+};
+
+
+static int __init ip_vs_sh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
new file mode 100644
index 00000000000..de5e7e118ee
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,942 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync:  sync connection info from master load balancer to backups
+ *              through multicast
+ *
+ * Changes:
+ *	Alexandre Cassen	:	Added master & backup support at a time.
+ *	Alexandre Cassen	:	Added SyncID support for incoming sync
+ *					messages filtering.
+ *	Justin Ossevoort	:	Fix endian problem on sync message size.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h>                 /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT  8848          /* multicast port */
+
+
+/*
+ *	IPVS sync connection entry
+ */
+struct ip_vs_sync_conn {
+	__u8			reserved;
+
+	/* Protocol, addresses and port numbers */
+	__u8			protocol;       /* Which protocol (TCP/UDP) */
+	__be16			cport;
+	__be16                  vport;
+	__be16                  dport;
+	__be32                  caddr;          /* client address */
+	__be32                  vaddr;          /* virtual address */
+	__be32                  daddr;          /* destination address */
+
+	/* Flags and state transition */
+	__be16                  flags;          /* status flags */
+	__be16                  state;          /* state info */
+
+	/* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
+	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+};
+
+struct ip_vs_sync_thread_data {
+	struct socket *sock;
+	char *buf;
+};
+
+#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
+#define FULL_CONN_SIZE  \
+(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+  The master mulitcasts messages to the backup load balancers in the
+  following format.
+
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (1)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                            .                                  |
+      |                            .                                  |
+      |                            .                                  |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (n)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+#define SYNC_MESG_HEADER_LEN	4
+#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+struct ip_vs_sync_mesg {
+	__u8                    nr_conns;
+	__u8                    syncid;
+	__u16                   size;
+
+	/* ip_vs_sync_conn entries start here */
+};
+
+/* the maximum length of sync (sending/receiving) message */
+static int sync_send_mesg_maxlen;
+static int sync_recv_mesg_maxlen;
+
+struct ip_vs_sync_buff {
+	struct list_head        list;
+	unsigned long           firstuse;
+
+	/* pointers for the message data */
+	struct ip_vs_sync_mesg  *mesg;
+	unsigned char           *head;
+	unsigned char           *end;
+};
+
+
+/* the sync_buff list head and the lock */
+static LIST_HEAD(ip_vs_sync_queue);
+static DEFINE_SPINLOCK(ip_vs_sync_lock);
+
+/* current sync_buff for accepting new conn entries */
+static struct ip_vs_sync_buff   *curr_sb = NULL;
+static DEFINE_SPINLOCK(curr_sb_lock);
+
+/* ipvs sync daemon state */
+volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
+volatile int ip_vs_master_syncid = 0;
+volatile int ip_vs_backup_syncid = 0;
+
+/* multicast interface name */
+char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+
+/* sync daemon tasks */
+static struct task_struct *sync_master_thread;
+static struct task_struct *sync_backup_thread;
+
+/* multicast addr */
+static struct sockaddr_in mcast_addr = {
+	.sin_family		= AF_INET,
+	.sin_port		= __constant_htons(IP_VS_SYNC_PORT),
+	.sin_addr.s_addr	= __constant_htonl(IP_VS_SYNC_GROUP),
+};
+
+
+static inline struct ip_vs_sync_buff *sb_dequeue(void)
+{
+	struct ip_vs_sync_buff *sb;
+
+	spin_lock_bh(&ip_vs_sync_lock);
+	if (list_empty(&ip_vs_sync_queue)) {
+		sb = NULL;
+	} else {
+		sb = list_entry(ip_vs_sync_queue.next,
+				struct ip_vs_sync_buff,
+				list);
+		list_del(&sb->list);
+	}
+	spin_unlock_bh(&ip_vs_sync_lock);
+
+	return sb;
+}
+
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+{
+	struct ip_vs_sync_buff *sb;
+
+	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+		return NULL;
+
+	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+		kfree(sb);
+		return NULL;
+	}
+	sb->mesg->nr_conns = 0;
+	sb->mesg->syncid = ip_vs_master_syncid;
+	sb->mesg->size = 4;
+	sb->head = (unsigned char *)sb->mesg + 4;
+	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+	kfree(sb->mesg);
+	kfree(sb);
+}
+
+static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+{
+	spin_lock(&ip_vs_sync_lock);
+	if (ip_vs_sync_state & IP_VS_STATE_MASTER)
+		list_add_tail(&sb->list, &ip_vs_sync_queue);
+	else
+		ip_vs_sync_buff_release(sb);
+	spin_unlock(&ip_vs_sync_lock);
+}
+
+/*
+ *	Get the current sync buffer if it has been created for more
+ *	than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(unsigned long time)
+{
+	struct ip_vs_sync_buff *sb;
+
+	spin_lock_bh(&curr_sb_lock);
+	if (curr_sb && (time == 0 ||
+			time_before(jiffies - curr_sb->firstuse, time))) {
+		sb = curr_sb;
+		curr_sb = NULL;
+	} else
+		sb = NULL;
+	spin_unlock_bh(&curr_sb_lock);
+	return sb;
+}
+
+
+/*
+ *      Add an ip_vs_conn information into the current sync_buff.
+ *      Called by ip_vs_in.
+ */
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
+{
+	struct ip_vs_sync_mesg *m;
+	struct ip_vs_sync_conn *s;
+	int len;
+
+	spin_lock(&curr_sb_lock);
+	if (!curr_sb) {
+		if (!(curr_sb=ip_vs_sync_buff_create())) {
+			spin_unlock(&curr_sb_lock);
+			IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+	}
+
+	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+		SIMPLE_CONN_SIZE;
+	m = curr_sb->mesg;
+	s = (struct ip_vs_sync_conn *)curr_sb->head;
+
+	/* copy members */
+	s->protocol = cp->protocol;
+	s->cport = cp->cport;
+	s->vport = cp->vport;
+	s->dport = cp->dport;
+	s->caddr = cp->caddr.ip;
+	s->vaddr = cp->vaddr.ip;
+	s->daddr = cp->daddr.ip;
+	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->state = htons(cp->state);
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		struct ip_vs_sync_conn_options *opt =
+			(struct ip_vs_sync_conn_options *)&s[1];
+		memcpy(opt, &cp->in_seq, sizeof(*opt));
+	}
+
+	m->nr_conns++;
+	m->size += len;
+	curr_sb->head += len;
+
+	/* check if there is a space for next one */
+	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
+		sb_queue_tail(curr_sb);
+		curr_sb = NULL;
+	}
+	spin_unlock(&curr_sb_lock);
+
+	/* synchronize its controller if it has */
+	if (cp->control)
+		ip_vs_sync_conn(cp->control);
+}
+
+
+/*
+ *      Process received multicast message and create the corresponding
+ *      ip_vs_conn entries.
+ */
+static void ip_vs_process_message(const char *buffer, const size_t buflen)
+{
+	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+	struct ip_vs_sync_conn *s;
+	struct ip_vs_sync_conn_options *opt;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_dest *dest;
+	char *p;
+	int i;
+
+	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
+		IP_VS_ERR_RL("sync message header too short\n");
+		return;
+	}
+
+	/* Convert size back to host byte order */
+	m->size = ntohs(m->size);
+
+	if (buflen != m->size) {
+		IP_VS_ERR_RL("bogus sync message size\n");
+		return;
+	}
+
+	/* SyncID sanity check */
+	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
+		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
+			  m->syncid);
+		return;
+	}
+
+	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+	for (i=0; i<m->nr_conns; i++) {
+		unsigned flags, state;
+
+		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+			IP_VS_ERR_RL("bogus conn in sync message\n");
+			return;
+		}
+		s = (struct ip_vs_sync_conn *) p;
+		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+		flags &= ~IP_VS_CONN_F_HASHED;
+		if (flags & IP_VS_CONN_F_SEQ_MASK) {
+			opt = (struct ip_vs_sync_conn_options *)&s[1];
+			p += FULL_CONN_SIZE;
+			if (p > buffer+buflen) {
+				IP_VS_ERR_RL("bogus conn options in sync message\n");
+				return;
+			}
+		} else {
+			opt = NULL;
+			p += SIMPLE_CONN_SIZE;
+		}
+
+		state = ntohs(s->state);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+			pp = ip_vs_proto_get(s->protocol);
+			if (!pp) {
+				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+					s->protocol);
+				continue;
+			}
+			if (state >= pp->num_states) {
+				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+					pp->name, state);
+				continue;
+			}
+		} else {
+			/* protocol in templates is not used for state/timeout */
+			pp = NULL;
+			if (state > 0) {
+				IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+					state);
+				state = 0;
+			}
+		}
+
+		if (!(flags & IP_VS_CONN_F_TEMPLATE))
+			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
+					       (union nf_inet_addr *)&s->caddr,
+					       s->cport,
+					       (union nf_inet_addr *)&s->vaddr,
+					       s->vport);
+		else
+			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
+					     (union nf_inet_addr *)&s->caddr,
+					     s->cport,
+					     (union nf_inet_addr *)&s->vaddr,
+					     s->vport);
+		if (!cp) {
+			/*
+			 * Find the appropriate destination for the connection.
+			 * If it is not found the connection will remain unbound
+			 * but still handled.
+			 */
+			dest = ip_vs_find_dest(AF_INET,
+					       (union nf_inet_addr *)&s->daddr,
+					       s->dport,
+					       (union nf_inet_addr *)&s->vaddr,
+					       s->vport,
+					       s->protocol);
+			/*  Set the approprite ativity flag */
+			if (s->protocol == IPPROTO_TCP) {
+				if (state != IP_VS_TCP_S_ESTABLISHED)
+					flags |= IP_VS_CONN_F_INACTIVE;
+				else
+					flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+			cp = ip_vs_conn_new(AF_INET, s->protocol,
+					    (union nf_inet_addr *)&s->caddr,
+					    s->cport,
+					    (union nf_inet_addr *)&s->vaddr,
+					    s->vport,
+					    (union nf_inet_addr *)&s->daddr,
+					    s->dport,
+					    flags, dest);
+			if (dest)
+				atomic_dec(&dest->refcnt);
+			if (!cp) {
+				IP_VS_ERR("ip_vs_conn_new failed\n");
+				return;
+			}
+		} else if (!cp->dest) {
+			dest = ip_vs_try_bind_dest(cp);
+			if (dest)
+				atomic_dec(&dest->refcnt);
+		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+			   (cp->state != state)) {
+			/* update active/inactive flag for the connection */
+			dest = cp->dest;
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				(state != IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				(state == IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+
+		if (opt)
+			memcpy(&cp->in_seq, opt, sizeof(*opt));
+		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+		cp->state = state;
+		cp->old_state = cp->state;
+		/*
+		 * We can not recover the right timeout for templates
+		 * in all cases, we can not find the right fwmark
+		 * virtual service. If needed, we can do it for
+		 * non-fwmark persistent services.
+		 */
+		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
+			cp->timeout = pp->timeout_table[state];
+		else
+			cp->timeout = (3*60*HZ);
+		ip_vs_conn_put(cp);
+	}
+}
+
+
+/*
+ *      Setup loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+	lock_sock(sk);
+	inet->mc_loop = loop ? 1 : 0;
+	release_sock(sk);
+}
+
+/*
+ *      Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+	lock_sock(sk);
+	inet->mc_ttl = ttl;
+	release_sock(sk);
+}
+
+/*
+ *      Specifiy default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+	struct net_device *dev;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+		return -ENODEV;
+
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	lock_sock(sk);
+	inet->mc_index = dev->ifindex;
+	/*  inet->mc_addr  = 0; */
+	release_sock(sk);
+
+	return 0;
+}
+
+
+/*
+ *	Set the maximum length of sync message according to the
+ *	specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(int sync_state)
+{
+	struct net_device *dev;
+	int num;
+
+	if (sync_state == IP_VS_STATE_MASTER) {
+		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+			return -ENODEV;
+
+		num = (dev->mtu - sizeof(struct iphdr) -
+		       sizeof(struct udphdr) -
+		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+		IP_VS_DBG(7, "setting the maximum length of sync sending "
+			  "message %d.\n", sync_send_mesg_maxlen);
+	} else if (sync_state == IP_VS_STATE_BACKUP) {
+		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+			return -ENODEV;
+
+		sync_recv_mesg_maxlen = dev->mtu -
+			sizeof(struct iphdr) - sizeof(struct udphdr);
+		IP_VS_DBG(7, "setting the maximum length of sync receiving "
+			  "message %d.\n", sync_recv_mesg_maxlen);
+	}
+
+	return 0;
+}
+
+
+/*
+ *      Join a multicast group.
+ *      the group is specified by a class D multicast address 224.0.0.0/8
+ *      in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+	struct ip_mreqn mreq;
+	struct net_device *dev;
+	int ret;
+
+	memset(&mreq, 0, sizeof(mreq));
+	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+		return -ENODEV;
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	mreq.imr_ifindex = dev->ifindex;
+
+	lock_sock(sk);
+	ret = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+
+	return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+	struct net_device *dev;
+	__be32 addr;
+	struct sockaddr_in sin;
+
+	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+		return -ENODEV;
+
+	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	if (!addr)
+		IP_VS_ERR("You probably need to specify IP address on "
+			  "multicast interface.\n");
+
+	IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
+		  ifname, NIPQUAD(addr));
+
+	/* Now bind the socket with the address of multicast interface */
+	sin.sin_family	     = AF_INET;
+	sin.sin_addr.s_addr  = addr;
+	sin.sin_port         = 0;
+
+	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ *      Set up sending multicast socket over UDP
+ */
+static struct socket * make_send_sock(void)
+{
+	struct socket *sock;
+	int result;
+
+	/* First create a socket */
+	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (result < 0) {
+		IP_VS_ERR("Error during creation of socket; terminating\n");
+		return ERR_PTR(result);
+	}
+
+	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+	if (result < 0) {
+		IP_VS_ERR("Error setting outbound mcast interface\n");
+		goto error;
+	}
+
+	set_mcast_loop(sock->sk, 0);
+	set_mcast_ttl(sock->sk, 1);
+
+	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+	if (result < 0) {
+		IP_VS_ERR("Error binding address of the mcast interface\n");
+		goto error;
+	}
+
+	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
+			sizeof(struct sockaddr), 0);
+	if (result < 0) {
+		IP_VS_ERR("Error connecting to the multicast addr\n");
+		goto error;
+	}
+
+	return sock;
+
+  error:
+	sock_release(sock);
+	return ERR_PTR(result);
+}
+
+
+/*
+ *      Set up receiving multicast socket over UDP
+ */
+static struct socket * make_receive_sock(void)
+{
+	struct socket *sock;
+	int result;
+
+	/* First create a socket */
+	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (result < 0) {
+		IP_VS_ERR("Error during creation of socket; terminating\n");
+		return ERR_PTR(result);
+	}
+
+	/* it is equivalent to the REUSEADDR option in user-space */
+	sock->sk->sk_reuse = 1;
+
+	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
+			sizeof(struct sockaddr));
+	if (result < 0) {
+		IP_VS_ERR("Error binding to the multicast addr\n");
+		goto error;
+	}
+
+	/* join the multicast group */
+	result = join_mcast_group(sock->sk,
+			(struct in_addr *) &mcast_addr.sin_addr,
+			ip_vs_backup_mcast_ifn);
+	if (result < 0) {
+		IP_VS_ERR("Error joining to the multicast group\n");
+		goto error;
+	}
+
+	return sock;
+
+  error:
+	sock_release(sock);
+	return ERR_PTR(result);
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+	struct kvec	iov;
+	int		len;
+
+	EnterFunction(7);
+	iov.iov_base     = (void *)buffer;
+	iov.iov_len      = length;
+
+	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+
+	LeaveFunction(7);
+	return len;
+}
+
+static void
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+	int msize;
+
+	msize = msg->size;
+
+	/* Put size in network byte order */
+	msg->size = htons(msg->size);
+
+	if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
+		IP_VS_ERR("ip_vs_send_async error\n");
+}
+
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+	struct msghdr		msg = {NULL,};
+	struct kvec		iov;
+	int			len;
+
+	EnterFunction(7);
+
+	/* Receive a packet */
+	iov.iov_base     = buffer;
+	iov.iov_len      = (size_t)buflen;
+
+	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+
+	if (len < 0)
+		return -1;
+
+	LeaveFunction(7);
+	return len;
+}
+
+
+static int sync_thread_master(void *data)
+{
+	struct ip_vs_sync_thread_data *tinfo = data;
+	struct ip_vs_sync_buff *sb;
+
+	IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
+		   "syncid = %d\n",
+		   ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+
+	while (!kthread_should_stop()) {
+		while ((sb = sb_dequeue())) {
+			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		/* check if entries stay in curr_sb for 2 seconds */
+		sb = get_curr_sync_buff(2 * HZ);
+		if (sb) {
+			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		schedule_timeout_interruptible(HZ);
+	}
+
+	/* clean up the sync_buff queue */
+	while ((sb=sb_dequeue())) {
+		ip_vs_sync_buff_release(sb);
+	}
+
+	/* clean up the current sync_buff */
+	if ((sb = get_curr_sync_buff(0))) {
+		ip_vs_sync_buff_release(sb);
+	}
+
+	/* release the sending multicast socket */
+	sock_release(tinfo->sock);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+static int sync_thread_backup(void *data)
+{
+	struct ip_vs_sync_thread_data *tinfo = data;
+	int len;
+
+	IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
+		   "syncid = %d\n",
+		   ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
+			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
+			 || kthread_should_stop());
+
+		/* do we have data now? */
+		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+			len = ip_vs_receive(tinfo->sock, tinfo->buf,
+					sync_recv_mesg_maxlen);
+			if (len <= 0) {
+				IP_VS_ERR("receiving message error\n");
+				break;
+			}
+
+			/* disable bottom half, because it accesses the data
+			   shared by softirq while getting/creating conns */
+			local_bh_disable();
+			ip_vs_process_message(tinfo->buf, len);
+			local_bh_enable();
+		}
+	}
+
+	/* release the sending multicast socket */
+	sock_release(tinfo->sock);
+	kfree(tinfo->buf);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+{
+	struct ip_vs_sync_thread_data *tinfo;
+	struct task_struct **realtask, *task;
+	struct socket *sock;
+	char *name, *buf = NULL;
+	int (*threadfn)(void *data);
+	int result = -ENOMEM;
+
+	IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+		  sizeof(struct ip_vs_sync_conn));
+
+	if (state == IP_VS_STATE_MASTER) {
+		if (sync_master_thread)
+			return -EEXIST;
+
+		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
+			sizeof(ip_vs_master_mcast_ifn));
+		ip_vs_master_syncid = syncid;
+		realtask = &sync_master_thread;
+		name = "ipvs_syncmaster";
+		threadfn = sync_thread_master;
+		sock = make_send_sock();
+	} else if (state == IP_VS_STATE_BACKUP) {
+		if (sync_backup_thread)
+			return -EEXIST;
+
+		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
+			sizeof(ip_vs_backup_mcast_ifn));
+		ip_vs_backup_syncid = syncid;
+		realtask = &sync_backup_thread;
+		name = "ipvs_syncbackup";
+		threadfn = sync_thread_backup;
+		sock = make_receive_sock();
+	} else {
+		return -EINVAL;
+	}
+
+	if (IS_ERR(sock)) {
+		result = PTR_ERR(sock);
+		goto out;
+	}
+
+	set_sync_mesg_maxlen(state);
+	if (state == IP_VS_STATE_BACKUP) {
+		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+		if (!buf)
+			goto outsocket;
+	}
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		goto outbuf;
+
+	tinfo->sock = sock;
+	tinfo->buf = buf;
+
+	task = kthread_run(threadfn, tinfo, name);
+	if (IS_ERR(task)) {
+		result = PTR_ERR(task);
+		goto outtinfo;
+	}
+
+	/* mark as active */
+	*realtask = task;
+	ip_vs_sync_state |= state;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	return 0;
+
+outtinfo:
+	kfree(tinfo);
+outbuf:
+	kfree(buf);
+outsocket:
+	sock_release(sock);
+out:
+	return result;
+}
+
+
+int stop_sync_thread(int state)
+{
+	IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
+
+	if (state == IP_VS_STATE_MASTER) {
+		if (!sync_master_thread)
+			return -ESRCH;
+
+		IP_VS_INFO("stopping master sync thread %d ...\n",
+			   task_pid_nr(sync_master_thread));
+
+		/*
+		 * The lock synchronizes with sb_queue_tail(), so that we don't
+		 * add sync buffers to the queue, when we are already in
+		 * progress of stopping the master sync daemon.
+		 */
+
+		spin_lock_bh(&ip_vs_sync_lock);
+		ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
+		spin_unlock_bh(&ip_vs_sync_lock);
+		kthread_stop(sync_master_thread);
+		sync_master_thread = NULL;
+	} else if (state == IP_VS_STATE_BACKUP) {
+		if (!sync_backup_thread)
+			return -ESRCH;
+
+		IP_VS_INFO("stopping backup sync thread %d ...\n",
+			   task_pid_nr(sync_backup_thread));
+
+		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
+		kthread_stop(sync_backup_thread);
+		sync_backup_thread = NULL;
+	} else {
+		return -EINVAL;
+	}
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
new file mode 100644
index 00000000000..8c596e71259
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -0,0 +1,128 @@
+/*
+ * IPVS:        Weighted Least-Connection Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
+ *     Wensong Zhang            :     changed to use the inactconns in scheduling
+ *     Wensong Zhang            :     changed some comestics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We think the overhead of processing active connections is 256
+	 * times higher than that of inactive connections in average. (This
+	 * 256 times might not be accurate, we will change it later) We
+	 * use the following formula to estimate the overhead now:
+	 *		  dest->activeconns*256 + dest->inactconns
+	 */
+	return (atomic_read(&dest->activeconns) << 8) +
+		atomic_read(&dest->inactconns);
+}
+
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *		  (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_wlc_dest_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_wlc_dest_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+	.name =			"wlc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
+	.schedule =		ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 00000000000..7ea92fed50b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,237 @@
+/*
+ * IPVS:        Weighted Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
+ *     Wensong Zhang            :     changed some comestics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
+ *     Julian Anastasov         :     fixed the bug of returning destination
+ *                                    with weight 0 when all weights are zero
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+	struct list_head *cl;	/* current list head */
+	int cw;			/* current weight */
+	int mw;			/* maximum weight */
+	int di;			/* decreasing interval */
+};
+
+
+/*
+ *    Get the gcd of server weights
+ */
+static int gcd(int a, int b)
+{
+	int c;
+
+	while ((c = a % b)) {
+		a = b;
+		b = c;
+	}
+	return b;
+}
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight;
+	int g = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		weight = atomic_read(&dest->weight);
+		if (weight > 0) {
+			if (g > 0)
+				g = gcd(weight, g);
+			else
+				g = weight;
+		}
+	}
+	return g ? g : 1;
+}
+
+
+/*
+ *    Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (atomic_read(&dest->weight) > weight)
+			weight = atomic_read(&dest->weight);
+	}
+
+	return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark;
+
+	/*
+	 *    Allocate the mark variable for WRR scheduling
+	 */
+	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+	if (mark == NULL) {
+		IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	mark->cl = &svc->destinations;
+	mark->cw = 0;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	svc->sched_data = mark;
+
+	return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+	/*
+	 *    Release the mark variable
+	 */
+	kfree(svc->sched_data);
+
+	return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+	mark->cl = &svc->destinations;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	if (mark->cw > mark->mw)
+		mark->cw = 0;
+	return 0;
+}
+
+
+/*
+ *    Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+	struct list_head *p;
+
+	IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
+
+	/*
+	 * This loop will always terminate, because mark->cw in (0, max_weight]
+	 * and at least one server has its weight equal to max_weight.
+	 */
+	write_lock(&svc->sched_lock);
+	p = mark->cl;
+	while (1) {
+		if (mark->cl == &svc->destinations) {
+			/* it is at the head of the destination list */
+
+			if (mark->cl == mark->cl->next) {
+				/* no dest entry */
+				dest = NULL;
+				goto out;
+			}
+
+			mark->cl = svc->destinations.next;
+			mark->cw -= mark->di;
+			if (mark->cw <= 0) {
+				mark->cw = mark->mw;
+				/*
+				 * Still zero, which means no available servers.
+				 */
+				if (mark->cw == 0) {
+					mark->cl = &svc->destinations;
+					IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
+						   "no available servers\n");
+					dest = NULL;
+					goto out;
+				}
+			}
+		} else
+			mark->cl = mark->cl->next;
+
+		if (mark->cl != &svc->destinations) {
+			/* not at the head of the list */
+			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    atomic_read(&dest->weight) >= mark->cw) {
+				/* got it */
+				break;
+			}
+		}
+
+		if (mark->cl == p && mark->cw == mark->di) {
+			/* back to the start, and no dest is found.
+			   It is only possible when all dests are OVERLOADED */
+			dest = NULL;
+			goto out;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+		      "activeconns %d refcnt %d weight %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->activeconns),
+		      atomic_read(&dest->refcnt),
+		      atomic_read(&dest->weight));
+
+  out:
+	write_unlock(&svc->sched_lock);
+	return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+	.name =			"wrr",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+	.supports_ipv6 =	1,
+#endif
+	.init_service =		ip_vs_wrr_init_svc,
+	.done_service =		ip_vs_wrr_done_svc,
+	.update_service =	ip_vs_wrr_update_svc,
+	.schedule =		ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
new file mode 100644
index 00000000000..02ddc2b3ce2
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -0,0 +1,1004 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>                  /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = dest->dst_cache;
+	dest->dst_cache = dst;
+	dest->dst_rtos = rtos;
+	dst_release(old_dst);
+}
+
+static inline struct dst_entry *
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+{
+	struct dst_entry *dst = dest->dst_cache;
+
+	if (!dst)
+		return NULL;
+	if ((dst->obsolete
+	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
+	    dst->ops->check(dst, cookie) == NULL) {
+		dest->dst_cache = NULL;
+		dst_release(dst);
+		return NULL;
+	}
+	dst_hold(dst);
+	return dst;
+}
+
+static struct rtable *
+__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct ip_vs_dest *dest = cp->dest;
+
+	if (dest) {
+		spin_lock(&dest->dst_lock);
+		if (!(rt = (struct rtable *)
+		      __ip_vs_dst_check(dest, rtos, 0))) {
+			struct flowi fl = {
+				.oif = 0,
+				.nl_u = {
+					.ip4_u = {
+						.daddr = dest->addr.ip,
+						.saddr = 0,
+						.tos = rtos, } },
+			};
+
+			if (ip_route_output_key(&init_net, &rt, &fl)) {
+				spin_unlock(&dest->dst_lock);
+				IP_VS_DBG_RL("ip_route_output error, "
+					     "dest: %u.%u.%u.%u\n",
+					     NIPQUAD(dest->addr.ip));
+				return NULL;
+			}
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
+				  NIPQUAD(dest->addr.ip),
+				  atomic_read(&rt->u.dst.__refcnt), rtos);
+		}
+		spin_unlock(&dest->dst_lock);
+	} else {
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = cp->daddr.ip,
+					.saddr = 0,
+					.tos = rtos, } },
+		};
+
+		if (ip_route_output_key(&init_net, &rt, &fl)) {
+			IP_VS_DBG_RL("ip_route_output error, dest: "
+				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
+			return NULL;
+		}
+	}
+
+	return rt;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static struct rt6_info *
+__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	struct ip_vs_dest *dest = cp->dest;
+
+	if (dest) {
+		spin_lock(&dest->dst_lock);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		if (!rt) {
+			struct flowi fl = {
+				.oif = 0,
+				.nl_u = {
+					.ip6_u = {
+						.daddr = dest->addr.in6,
+						.saddr = {
+							.s6_addr32 =
+								{ 0, 0, 0, 0 },
+						},
+					},
+				},
+			};
+
+			rt = (struct rt6_info *)ip6_route_output(&init_net,
+								 NULL, &fl);
+			if (!rt) {
+				spin_unlock(&dest->dst_lock);
+				IP_VS_DBG_RL("ip6_route_output error, "
+					     "dest: " NIP6_FMT "\n",
+					     NIP6(dest->addr.in6));
+				return NULL;
+			}
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
+			IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
+				  NIP6(dest->addr.in6),
+				  atomic_read(&rt->u.dst.__refcnt));
+		}
+		spin_unlock(&dest->dst_lock);
+	} else {
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip6_u = {
+					.daddr = cp->daddr.in6,
+					.saddr = {
+						.s6_addr32 = { 0, 0, 0, 0 },
+					},
+				},
+			},
+		};
+
+		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+		if (!rt) {
+			IP_VS_DBG_RL("ip6_route_output error, dest: "
+				     NIP6_FMT "\n", NIP6(cp->daddr.in6));
+			return NULL;
+		}
+	}
+
+	return rt;
+}
+#endif
+
+
+/*
+ *	Release dest->dst_cache before a dest is removed
+ */
+void
+ip_vs_dst_reset(struct ip_vs_dest *dest)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = dest->dst_cache;
+	dest->dst_cache = NULL;
+	dst_release(old_dst);
+}
+
+#define IP_VS_XMIT(pf, skb, rt)				\
+do {							\
+	(skb)->ipvs_property = 1;			\
+	skb_forward_csum(skb);				\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
+		(rt)->u.dst.dev, dst_output);		\
+} while (0)
+
+
+/*
+ *      NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp)
+{
+	/* we do not touch skb and do not need pskb ptr */
+	return NF_ACCEPT;
+}
+
+
+/*
+ *      Bypass transmitter
+ *      Let packets bypass the destination when the destination is not
+ *      available, it may be only used in transparent cache cluster.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = ip_hdr(skb);
+	u8     tos = iph->tos;
+	int    mtu;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = iph->daddr,
+				.saddr = 0,
+				.tos = RT_TOS(tos), } },
+	};
+
+	EnterFunction(10);
+
+	if (ip_route_output_key(&init_net, &rt, &fl)) {
+		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+		goto tx_error_icmp;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+		ip_rt_put(rt);
+		return NF_STOLEN;
+	}
+	ip_send_check(ip_hdr(skb));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+ tx_error_icmp:
+	dst_link_failure(skb);
+ tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	struct ipv6hdr  *iph = ipv6_hdr(skb);
+	int    mtu;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = iph->daddr,
+				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
+	};
+
+	EnterFunction(10);
+
+	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	if (!rt) {
+		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
+			     "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
+		goto tx_error_icmp;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		dst_release(&rt->u.dst);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->u.dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+ tx_error_icmp:
+	dst_link_failure(skb);
+ tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+/*
+ *      NAT transmitter (only for outside-to-inside nat forwarding)
+ *      Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	       struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;		/* Route to the other host */
+	int mtu;
+	struct iphdr *iph = ip_hdr(skb);
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error;
+	ip_hdr(skb)->daddr = cp->daddr.ip;
+	ip_send_check(ip_hdr(skb));
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+	return NF_STOLEN;
+  tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;		/* Route to the other host */
+	int mtu;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
+				       sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		dst_release(&rt->u.dst);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): frag needed for");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error;
+	ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+	return NF_STOLEN;
+tx_error_put:
+	dst_release(&rt->u.dst);
+	goto tx_error;
+}
+#endif
+
+
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet, its
+ *   destination will be set to cp->daddr. Most code of this function
+ *   is taken from ipip.c.
+ *
+ *   It is used in VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, processe
+ *   the request and return the response packets directly to the client
+ *   without passing the load balancer. This can greatly increase the
+ *   scalability of virtual server.
+ *
+ *   Used for ANY protocol
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *old_iph = ip_hdr(skb);
+	u8     tos = old_iph->tos;
+	__be16 df = old_iph->frag_off;
+	sk_buff_data_t old_transport_header = skb->transport_header;
+	struct iphdr  *iph;			/* Our new IP header */
+	unsigned int max_headroom;		/* The extra header space needed */
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (skb->protocol != htons(ETH_P_IP)) {
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
+			     "ETH_P_IP: %d, skb protocol: %d\n",
+			     htons(ETH_P_IP), skb->protocol);
+		goto tx_error;
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+		goto tx_error_icmp;
+
+	tdev = rt->u.dst.dev;
+
+	mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+	if (mtu < 68) {
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+		goto tx_error;
+	}
+	if (skb->dst)
+		skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+	df |= (old_iph->frag_off & htons(IP_DF));
+
+	if ((old_iph->frag_off & htons(IP_DF))
+	    && mtu < ntohs(old_iph->tot_len)) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		old_iph = ip_hdr(skb);
+	}
+
+	skb->transport_header = old_transport_header;
+
+	/* fix old IP header checksum */
+	ip_send_check(old_iph);
+
+	skb_push(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+	iph			=	ip_hdr(skb);
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPIP;
+	iph->tos		=	tos;
+	iph->daddr		=	rt->rt_dst;
+	iph->saddr		=	rt->rt_src;
+	iph->ttl		=	old_iph->ttl;
+	ip_select_ident(iph, &rt->u.dst, NULL);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	ip_local_out(skb);
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;		/* Route to the other host */
+	struct net_device *tdev;	/* Device to other host */
+	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
+	sk_buff_data_t old_transport_header = skb->transport_header;
+	struct ipv6hdr  *iph;		/* Our new IP header */
+	unsigned int max_headroom;	/* The extra header space needed */
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (skb->protocol != htons(ETH_P_IPV6)) {
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
+			     "ETH_P_IPV6: %d, skb protocol: %d\n",
+			     htons(ETH_P_IPV6), skb->protocol);
+		goto tx_error;
+	}
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	tdev = rt->u.dst.dev;
+
+	mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
+	/* TODO IPv6: do we need this check in IPv6? */
+	if (mtu < 1280) {
+		dst_release(&rt->u.dst);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
+		goto tx_error;
+	}
+	if (skb->dst)
+		skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		dst_release(&rt->u.dst);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			dst_release(&rt->u.dst);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		old_iph = ipv6_hdr(skb);
+	}
+
+	skb->transport_header = old_transport_header;
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+	iph			=	ipv6_hdr(skb);
+	iph->version		=	6;
+	iph->nexthdr		=	IPPROTO_IPV6;
+	iph->payload_len	=	old_iph->payload_len + sizeof(old_iph);
+	iph->priority		=	old_iph->priority;
+	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
+	iph->daddr		=	rt->rt6i_dst.addr;
+	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	iph->hop_limit		=	old_iph->hop_limit;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	ip6_local_out(skb);
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *      Direct Routing transmitter
+ *      Used for ANY protocol
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	      struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = ip_hdr(skb);
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+		ip_rt_put(rt);
+		return NF_STOLEN;
+	}
+	ip_send_check(ip_hdr(skb));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		 struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	int    mtu;
+
+	EnterFunction(10);
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		dst_release(&rt->u.dst);
+		IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->u.dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *	ICMP packet transmitter
+ *	called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset)
+{
+	struct rtable	*rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop the old route when skb is not shared */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	ip_vs_nat_icmp(skb, pp, cp, 0);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET, skb, rt);
+
+	rc = NF_STOLEN;
+	goto out;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+  out:
+	LeaveFunction(10);
+	return rc;
+  tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset)
+{
+	struct rt6_info	*rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	rt = __ip_vs_get_out_rt_v6(cp);
+	if (!rt)
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if (skb->len > mtu) {
+		dst_release(&rt->u.dst);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop the old route when skb is not shared */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(PF_INET6, skb, rt);
+
+	rc = NF_STOLEN;
+	goto out;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+out:
+	LeaveFunction(10);
+	return rc;
+tx_error_put:
+	dst_release(&rt->u.dst);
+	goto tx_error;
+}
+#endif
-- 
cgit v1.2.3


From 74af025073461b9ebe82771e48a5b8596c3cf75c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 5 Sep 2008 12:38:09 -0700
Subject: wireless: restore revert lost to merge damage

Restore revert "mac80211: Use IWEVASSOCREQIE instead of IWEVCUSTOM",
originally reverted in commit bf7394ccc13fe291d9258f01113b4c61214ddeae.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 47 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e859a0ab616..edc339d649c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -650,20 +650,51 @@ static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata,
 static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
 					 struct ieee80211_if_sta *ifsta)
 {
+	char *buf;
+	size_t len;
+	int i;
 	union iwreq_data wrqu;
 
+	if (!ifsta->assocreq_ies && !ifsta->assocresp_ies)
+		return;
+
+	buf = kmalloc(50 + 2 * (ifsta->assocreq_ies_len +
+				ifsta->assocresp_ies_len), GFP_KERNEL);
+	if (!buf)
+		return;
+
+	len = sprintf(buf, "ASSOCINFO(");
 	if (ifsta->assocreq_ies) {
-		memset(&wrqu, 0, sizeof(wrqu));
-		wrqu.data.length = ifsta->assocreq_ies_len;
-		wireless_send_event(sdata->dev, IWEVASSOCREQIE, &wrqu,
-				    ifsta->assocreq_ies);
+		len += sprintf(buf + len, "ReqIEs=");
+		for (i = 0; i < ifsta->assocreq_ies_len; i++) {
+			len += sprintf(buf + len, "%02x",
+				       ifsta->assocreq_ies[i]);
+		}
 	}
 	if (ifsta->assocresp_ies) {
-		memset(&wrqu, 0, sizeof(wrqu));
-		wrqu.data.length = ifsta->assocresp_ies_len;
-		wireless_send_event(sdata->dev, IWEVASSOCRESPIE, &wrqu,
-				    ifsta->assocresp_ies);
+		if (ifsta->assocreq_ies)
+			len += sprintf(buf + len, " ");
+		len += sprintf(buf + len, "RespIEs=");
+		for (i = 0; i < ifsta->assocresp_ies_len; i++) {
+			len += sprintf(buf + len, "%02x",
+				       ifsta->assocresp_ies[i]);
+		}
 	}
+	len += sprintf(buf + len, ")");
+
+	if (len > IW_CUSTOM_MAX) {
+		len = sprintf(buf, "ASSOCRESPIE=");
+		for (i = 0; i < ifsta->assocresp_ies_len; i++) {
+			len += sprintf(buf + len, "%02x",
+				       ifsta->assocresp_ies[i]);
+		}
+	}
+
+	memset(&wrqu, 0, sizeof(wrqu));
+	wrqu.data.length = len;
+	wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+
+	kfree(buf);
 }
 
 
-- 
cgit v1.2.3


From 8ef9dad3f7c0bdae84cd57f2bc6d4f53421406a8 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Sat, 27 Sep 2008 22:58:18 +0300
Subject: mac80211: remove shadowed variables in ieee80211_master_start_xmit

This patch removes doubly defined variables in ieee80211_master_start_xmit

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/tx.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 0cc2e23f082..226ce77363d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1255,8 +1255,7 @@ static int ieee80211_skb_resize(struct ieee80211_local *local,
 	return 0;
 }
 
-int ieee80211_master_start_xmit(struct sk_buff *skb,
-				struct net_device *dev)
+int ieee80211_master_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
 	struct ieee80211_local *local = mpriv->local;
@@ -1308,8 +1307,6 @@ int ieee80211_master_start_xmit(struct sk_buff *skb,
 		}
 	} else if (unlikely(osdata->vif.type == NL80211_IFTYPE_MONITOR)) {
 		struct ieee80211_sub_if_data *sdata;
-		struct ieee80211_local *local = osdata->local;
-		struct ieee80211_hdr *hdr;
 		int hdrlen;
 		u16 len_rthdr;
 
-- 
cgit v1.2.3


From 5d6ffc533678c936e366809acaff8401af43a4af Mon Sep 17 00:00:00 2001
From: Davide Pesavento <davidepesa@gmail.com>
Date: Tue, 30 Sep 2008 19:56:34 +0200
Subject: wireless: fix typo in Kconfig.

Signed-off-by: Davide Pesavento <davidepesa@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index b97bd9fe6b7..7d82be07fa1 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -15,7 +15,7 @@ config NL80211
 	  If unsure, say Y.
 
 config WIRELESS_OLD_REGULATORY
-	bool "Old wireless static regulatory defintions"
+	bool "Old wireless static regulatory definitions"
 	default n
 	---help---
 	  This option enables the old static regulatory information
-- 
cgit v1.2.3


From f74b6a5498049bab28419a03e4b31fcdbe7a900d Mon Sep 17 00:00:00 2001
From: Rami Rosen <roszenrami@gmail.com>
Date: Thu, 2 Oct 2008 16:48:22 +0300
Subject: mac80211: remove redundant check in ieee80211_master_start_xmit
 (net/mac80211/tx.c)

 - This patch (against the linux-wireless-next git tree) removes a
redundant check in ieee80211_master_start_xmit (net/mac80211/tx.c)
and adjust indentation in this method accordingly.

 In this method, there is no need to call again the
ieee80211_is_data() method; this is checked immediately before, in the
"if" command (we will not enter this block unless ieee80211_is_data()
is true, so that the "and" (&&) condition in that "if" command will be
fullfilled ).

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/tx.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 226ce77363d..d7153bbcdb2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1295,16 +1295,14 @@ int ieee80211_master_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	if (ieee80211_vif_is_mesh(&osdata->vif) &&
 	    ieee80211_is_data(hdr->frame_control)) {
-		if (ieee80211_is_data(hdr->frame_control)) {
-			if (is_multicast_ether_addr(hdr->addr3))
-				memcpy(hdr->addr1, hdr->addr3, ETH_ALEN);
-			else
-				if (mesh_nexthop_lookup(skb, osdata))
-					return  0;
-			if (memcmp(odev->dev_addr, hdr->addr4, ETH_ALEN) != 0)
-				IEEE80211_IFSTA_MESH_CTR_INC(&osdata->u.mesh,
-							     fwded_frames);
-		}
+		if (is_multicast_ether_addr(hdr->addr3))
+			memcpy(hdr->addr1, hdr->addr3, ETH_ALEN);
+		else
+			if (mesh_nexthop_lookup(skb, osdata))
+				return  0;
+		if (memcmp(odev->dev_addr, hdr->addr4, ETH_ALEN) != 0)
+			IEEE80211_IFSTA_MESH_CTR_INC(&osdata->u.mesh,
+							    fwded_frames);
 	} else if (unlikely(osdata->vif.type == NL80211_IFTYPE_MONITOR)) {
 		struct ieee80211_sub_if_data *sdata;
 		int hdrlen;
-- 
cgit v1.2.3


From 417bd25ac4c6f76c8aafe8a584f3620f4a936b72 Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Fri, 3 Oct 2008 16:58:05 -0300
Subject: rfkill: update LEDs for all state changes

The LED state was not being updated by rfkill_force_state(), which
will cause regressions in wireless drivers that had old-style rfkill
support and are updated to use rfkill_force_state().

The LED state was not being updated when a change was detected through
the rfkill->get_state() hook, either.

Move the LED trigger update calls into notify_rfkill_state_change(),
where it should have been in the first place.  This takes care of both
issues above.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Cc: stable@kernel.org
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/rfkill/rfkill.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index ea0dc04b3c7..f949a482b00 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -125,6 +125,7 @@ static void rfkill_led_trigger_activate(struct led_classdev *led)
 
 static void notify_rfkill_state_change(struct rfkill *rfkill)
 {
+	rfkill_led_trigger(rfkill, rfkill->state);
 	blocking_notifier_call_chain(&rfkill_notifier_list,
 			RFKILL_STATE_CHANGED,
 			rfkill);
@@ -217,10 +218,8 @@ static int rfkill_toggle_radio(struct rfkill *rfkill,
 			rfkill->state = state;
 	}
 
-	if (force || rfkill->state != oldstate) {
-		rfkill_led_trigger(rfkill, rfkill->state);
+	if (force || rfkill->state != oldstate)
 		notify_rfkill_state_change(rfkill);
-	}
 
 	return retval;
 }
-- 
cgit v1.2.3


From 76708dee382a69b2f9d0e50f413f99fefb2dc509 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Sun, 5 Oct 2008 18:02:48 +0200
Subject: mac80211: free up 2 bytes in skb->cb

Free up 2 bytes in skb->cb to be used for multi-rate retry later.
Move iv_len and icv_len initialization into key alloc.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/key.c | 14 ++++++++++++++
 net/mac80211/wep.c |  3 ---
 net/mac80211/wpa.c |  6 ------
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 57afcd38cd9..a5b06fe7198 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -281,6 +281,20 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
 	key->conf.alg = alg;
 	key->conf.keyidx = idx;
 	key->conf.keylen = key_len;
+	switch (alg) {
+	case ALG_WEP:
+		key->conf.iv_len = WEP_IV_LEN;
+		key->conf.icv_len = WEP_ICV_LEN;
+		break;
+	case ALG_TKIP:
+		key->conf.iv_len = TKIP_IV_LEN;
+		key->conf.icv_len = TKIP_ICV_LEN;
+		break;
+	case ALG_CCMP:
+		key->conf.iv_len = CCMP_HDR_LEN;
+		key->conf.icv_len = CCMP_MIC_LEN;
+		break;
+	}
 	memcpy(key->conf.key, key_data, key_len);
 	INIT_LIST_HEAD(&key->list);
 	INIT_LIST_HEAD(&key->todo);
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 376c84987e4..f0e2d3ecb5c 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -313,9 +313,6 @@ static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
-	info->control.iv_len = WEP_IV_LEN;
-	info->control.icv_len = WEP_ICV_LEN;
-
 	if (!(tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) {
 		if (ieee80211_wep_encrypt(tx->local, skb, tx->key))
 			return -1;
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 37ae9a959f6..6db649480e8 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -152,9 +152,6 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 	int len, tail;
 	u8 *pos;
 
-	info->control.icv_len = TKIP_ICV_LEN;
-	info->control.iv_len = TKIP_IV_LEN;
-
 	if ((tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) &&
 	    !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
 		/* hwaccel - with no need for preallocated room for IV/ICV */
@@ -374,9 +371,6 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 	u8 *pos, *pn;
 	int i;
 
-	info->control.icv_len = CCMP_MIC_LEN;
-	info->control.iv_len = CCMP_HDR_LEN;
-
 	if ((tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) &&
 	    !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
 		/* hwaccel - with no need for preallocated room for CCMP "
-- 
cgit v1.2.3


From 870abdf67170daa9f1022e55a35c469239fcc74c Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Sun, 5 Oct 2008 18:04:24 +0200
Subject: mac80211: add multi-rate retry support

This patch adjusts the rate control API to allow multi-rate retry
if supported by the driver. The ieee80211_hw struct specifies how
many alternate rate selections the driver supports.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/tx.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index d7153bbcdb2..1460537faf3 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -454,15 +454,16 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
 		if (unlikely(rsel.probe_idx >= 0)) {
 			info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
 			tx->flags |= IEEE80211_TX_PROBE_LAST_FRAG;
-			info->control.alt_retry_rate_idx = tx->rate_idx;
+			info->control.retries[0].rate_idx = tx->rate_idx;
+			info->control.retries[0].limit = tx->local->hw.max_altrate_tries;
 			tx->rate_idx = rsel.probe_idx;
-		} else
-			info->control.alt_retry_rate_idx = -1;
+		} else if (info->control.retries[0].limit == 0)
+			info->control.retries[0].rate_idx = -1;
 
 		if (unlikely(tx->rate_idx < 0))
 			return TX_DROP;
 	} else
-		info->control.alt_retry_rate_idx = -1;
+		info->control.retries[0].rate_idx = -1;
 
 	if (tx->sdata->bss_conf.use_cts_prot &&
 	    (tx->flags & IEEE80211_TX_FRAGMENTED) && (rsel.nonerp_idx >= 0)) {
@@ -521,7 +522,7 @@ ieee80211_tx_h_misc(struct ieee80211_tx_data *tx)
 		 * frames.
 		 * TODO: The last fragment could still use multiple retry
 		 * rates. */
-		info->control.alt_retry_rate_idx = -1;
+		info->control.retries[0].rate_idx = -1;
 	}
 
 	/* Use CTS protection for unicast frames sent using extended rates if
@@ -551,7 +552,7 @@ ieee80211_tx_h_misc(struct ieee80211_tx_data *tx)
 		int idx;
 
 		/* Do not use multiple retry rates when using RTS/CTS */
-		info->control.alt_retry_rate_idx = -1;
+		info->control.retries[0].rate_idx = -1;
 
 		/* Use min(data rate, max base rate) as CTS/RTS rate */
 		rate = &sband->bitrates[tx->rate_idx];
-- 
cgit v1.2.3


From cccf129f820e431d84690729254a32f1709328fb Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Sun, 5 Oct 2008 18:07:45 +0200
Subject: mac80211: add the 'minstrel' rate control algorithm

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/Kconfig                    |  13 +
 net/mac80211/Makefile                   |   4 +
 net/mac80211/main.c                     |   5 +
 net/mac80211/rate.h                     |  14 +
 net/mac80211/rc80211_minstrel.c         | 583 ++++++++++++++++++++++++++++++++
 net/mac80211/rc80211_minstrel.h         |  85 +++++
 net/mac80211/rc80211_minstrel_debugfs.c | 164 +++++++++
 7 files changed, 868 insertions(+)
 create mode 100644 net/mac80211/rc80211_minstrel.c
 create mode 100644 net/mac80211/rc80211_minstrel.h
 create mode 100644 net/mac80211/rc80211_minstrel_debugfs.c

(limited to 'net')

diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 8427518e4f2..7f710a27e91 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -22,6 +22,11 @@ config MAC80211_RC_PID
 	  mac80211 that uses a PID controller to select the TX
 	  rate.
 
+config MAC80211_RC_MINSTREL
+	bool "Minstrel"
+	---help---
+	  This option enables the 'minstrel' TX rate control algorithm
+
 choice
 	prompt "Default rate control algorithm"
 	default MAC80211_RC_DEFAULT_PID
@@ -39,11 +44,19 @@ config MAC80211_RC_DEFAULT_PID
 	  default rate control algorithm. You should choose
 	  this unless you know what you are doing.
 
+config MAC80211_RC_DEFAULT_MINSTREL
+	bool "Minstrel"
+	depends on MAC80211_RC_MINSTREL
+	---help---
+	  Select Minstrel as the default rate control algorithm.
+
+
 endchoice
 
 config MAC80211_RC_DEFAULT
 	string
 	default "pid" if MAC80211_RC_DEFAULT_PID
+	default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
 	default ""
 
 endmenu
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 2dc8f2bff27..31cfd1f89a7 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -41,4 +41,8 @@ mac80211-$(CONFIG_MAC80211_MESH) += \
 rc80211_pid-y := rc80211_pid_algo.o
 rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o
 
+rc80211_minstrel-y := rc80211_minstrel.o
+rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o
+
 mac80211-$(CONFIG_MAC80211_RC_PID) += $(rc80211_pid-y)
+mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index d608c44047c..ae62ad40ad6 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1015,6 +1015,10 @@ static int __init ieee80211_init(void)
 	BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, driver_data) +
 	             IEEE80211_TX_INFO_DRIVER_DATA_SIZE > sizeof(skb->cb));
 
+	ret = rc80211_minstrel_init();
+	if (ret)
+		return ret;
+
 	ret = rc80211_pid_init();
 	if (ret)
 		return ret;
@@ -1027,6 +1031,7 @@ static int __init ieee80211_init(void)
 static void __exit ieee80211_exit(void)
 {
 	rc80211_pid_exit();
+	rc80211_minstrel_exit();
 
 	/*
 	 * For key todo, it'll be empty by now but the work
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index eb94e584d24..d0092f847f8 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -125,4 +125,18 @@ static inline void rc80211_pid_exit(void)
 }
 #endif
 
+#ifdef CONFIG_MAC80211_RC_MINSTREL
+extern int rc80211_minstrel_init(void);
+extern void rc80211_minstrel_exit(void);
+#else
+static inline int rc80211_minstrel_init(void)
+{
+	return 0;
+}
+static inline void rc80211_minstrel_exit(void)
+{
+}
+#endif
+
+
 #endif /* IEEE80211_RATE_H */
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
new file mode 100644
index 00000000000..f6d69dab07a
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on minstrel.c:
+ *   Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz>
+ *   Sponsored by Indranet Technologies Ltd
+ *
+ * Based on sample.c:
+ *   Copyright (c) 2005 John Bicket
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer,
+ *      without modification.
+ *   2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *      similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *      redistribution must be conditioned upon including a substantially
+ *      similar Disclaimer requirement for further binary redistribution.
+ *   3. Neither the names of the above-listed copyright holders nor the names
+ *      of any contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *   Alternatively, this software may be distributed under the terms of the
+ *   GNU General Public License ("GPL") version 2 as published by the Free
+ *   Software Foundation.
+ *
+ *   NO WARRANTY
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ *   AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ *   THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ *   OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ *   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGES.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "rc80211_minstrel.h"
+
+#define SAMPLE_COLUMNS	10
+#define SAMPLE_TBL(_mi, _idx, _col) \
+		_mi->sample_table[(_idx * SAMPLE_COLUMNS) + _col]
+
+/* convert mac80211 rate index to local array index */
+static inline int
+rix_to_ndx(struct minstrel_sta_info *mi, int rix)
+{
+	int i = rix;
+	for (i = rix; i >= 0; i--)
+		if (mi->r[i].rix == rix)
+			break;
+	WARN_ON(mi->r[i].rix != rix);
+	return i;
+}
+
+static inline bool
+use_low_rate(struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	u16 fc;
+
+	fc = le16_to_cpu(hdr->frame_control);
+
+	return ((info->flags & IEEE80211_TX_CTL_NO_ACK) ||
+		(fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA ||
+		is_multicast_ether_addr(hdr->addr1));
+}
+
+
+static void
+minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
+{
+	u32 max_tp = 0, index_max_tp = 0, index_max_tp2 = 0;
+	u32 max_prob = 0, index_max_prob = 0;
+	u32 usecs;
+	u32 p;
+	int i;
+
+	mi->stats_update = jiffies;
+	for (i = 0; i < mi->n_rates; i++) {
+		struct minstrel_rate *mr = &mi->r[i];
+
+		usecs = mr->perfect_tx_time;
+		if (!usecs)
+			usecs = 1000000;
+
+		/* To avoid rounding issues, probabilities scale from 0 (0%)
+		 * to 18000 (100%) */
+		if (mr->attempts) {
+			p = (mr->success * 18000) / mr->attempts;
+			mr->succ_hist += mr->success;
+			mr->att_hist += mr->attempts;
+			mr->cur_prob = p;
+			p = ((p * (100 - mp->ewma_level)) + (mr->probability *
+				mp->ewma_level)) / 100;
+			mr->probability = p;
+			mr->cur_tp = p * (1000000 / usecs);
+		}
+
+		mr->last_success = mr->success;
+		mr->last_attempts = mr->attempts;
+		mr->success = 0;
+		mr->attempts = 0;
+
+		/* Sample less often below the 10% chance of success.
+		 * Sample less often above the 95% chance of success. */
+		if ((mr->probability > 17100) || (mr->probability < 1800)) {
+			mr->adjusted_retry_count = mr->retry_count >> 1;
+			if (mr->adjusted_retry_count > 2)
+				mr->adjusted_retry_count = 2;
+		} else {
+			mr->adjusted_retry_count = mr->retry_count;
+		}
+		if (!mr->adjusted_retry_count)
+			mr->adjusted_retry_count = 2;
+	}
+
+	for (i = 0; i < mi->n_rates; i++) {
+		struct minstrel_rate *mr = &mi->r[i];
+		if (max_tp < mr->cur_tp) {
+			index_max_tp = i;
+			max_tp = mr->cur_tp;
+		}
+		if (max_prob < mr->probability) {
+			index_max_prob = i;
+			max_prob = mr->probability;
+		}
+	}
+
+	max_tp = 0;
+	for (i = 0; i < mi->n_rates; i++) {
+		struct minstrel_rate *mr = &mi->r[i];
+
+		if (i == index_max_tp)
+			continue;
+
+		if (max_tp < mr->cur_tp) {
+			index_max_tp2 = i;
+			max_tp = mr->cur_tp;
+		}
+	}
+	mi->max_tp_rate = index_max_tp;
+	mi->max_tp_rate2 = index_max_tp2;
+	mi->max_prob_rate = index_max_prob;
+}
+
+static void
+minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband,
+                   struct ieee80211_sta *sta, void *priv_sta,
+		   struct sk_buff *skb)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_tx_altrate *ar = info->status.retries;
+	struct minstrel_priv *mp = priv;
+	int i, ndx, tries;
+	int success = 0;
+
+	if (!info->status.excessive_retries)
+		success = 1;
+
+	if (!mp->has_mrr || (ar[0].rate_idx < 0)) {
+		ndx = rix_to_ndx(mi, info->tx_rate_idx);
+		tries = info->status.retry_count + 1;
+		mi->r[ndx].success += success;
+		mi->r[ndx].attempts += tries;
+		return;
+	}
+
+	for (i = 0; i < 4; i++) {
+		if (ar[i].rate_idx < 0)
+			break;
+
+		ndx = rix_to_ndx(mi, ar[i].rate_idx);
+		mi->r[ndx].attempts += ar[i].limit + 1;
+
+		if ((i != 3) && (ar[i + 1].rate_idx < 0))
+			mi->r[ndx].success += success;
+	}
+
+	if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0))
+		mi->sample_count++;
+
+	if (mi->sample_deferred > 0)
+		mi->sample_deferred--;
+}
+
+
+static inline unsigned int
+minstrel_get_retry_count(struct minstrel_rate *mr,
+                         struct ieee80211_tx_info *info)
+{
+	unsigned int retry = mr->adjusted_retry_count;
+
+	if (info->flags & IEEE80211_TX_CTL_USE_RTS_CTS)
+		retry = max(2U, min(mr->retry_count_rtscts, retry));
+	else if (info->flags & IEEE80211_TX_CTL_USE_CTS_PROTECT)
+		retry = max(2U, min(mr->retry_count_cts, retry));
+	return retry;
+}
+
+
+static int
+minstrel_get_next_sample(struct minstrel_sta_info *mi)
+{
+	unsigned int sample_ndx;
+	sample_ndx = SAMPLE_TBL(mi, mi->sample_idx, mi->sample_column);
+	mi->sample_idx++;
+	if (mi->sample_idx > (mi->n_rates - 2)) {
+		mi->sample_idx = 0;
+		mi->sample_column++;
+		if (mi->sample_column >= SAMPLE_COLUMNS)
+			mi->sample_column = 0;
+	}
+	return sample_ndx;
+}
+
+void
+minstrel_get_rate(void *priv, struct ieee80211_supported_band *sband,
+                  struct ieee80211_sta *sta, void *priv_sta,
+                  struct sk_buff *skb, struct rate_selection *sel)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct minstrel_sta_info *mi = priv_sta;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_tx_altrate *ar = info->control.retries;
+	unsigned int ndx, sample_ndx = 0;
+	bool mrr;
+	bool sample_slower = false;
+	bool sample = false;
+	int i, delta;
+	int mrr_ndx[3];
+	int sample_rate;
+
+	if (!sta || !mi || use_low_rate(skb)) {
+		sel->rate_idx = rate_lowest_index(sband, sta);
+		return;
+	}
+
+	mrr = mp->has_mrr;
+
+	/* mac80211 does not allow mrr for RTS/CTS */
+	if ((info->flags & IEEE80211_TX_CTL_USE_RTS_CTS) ||
+	    (info->flags & IEEE80211_TX_CTL_USE_CTS_PROTECT))
+		mrr = false;
+
+	if (time_after(jiffies, mi->stats_update + (mp->update_interval *
+			HZ) / 1000))
+		minstrel_update_stats(mp, mi);
+
+	ndx = mi->max_tp_rate;
+
+	if (mrr)
+		sample_rate = mp->lookaround_rate_mrr;
+	else
+		sample_rate = mp->lookaround_rate;
+
+	mi->packet_count++;
+	delta = (mi->packet_count * sample_rate / 100) -
+			(mi->sample_count + mi->sample_deferred / 2);
+
+	/* delta > 0: sampling required */
+	if (delta > 0) {
+		if (mi->packet_count >= 10000) {
+			mi->sample_deferred = 0;
+			mi->sample_count = 0;
+			mi->packet_count = 0;
+		} else if (delta > mi->n_rates * 2) {
+			/* With multi-rate retry, not every planned sample
+			 * attempt actually gets used, due to the way the retry
+			 * chain is set up - [max_tp,sample,prob,lowest] for
+			 * sample_rate < max_tp.
+			 *
+			 * If there's too much sampling backlog and the link
+			 * starts getting worse, minstrel would start bursting
+			 * out lots of sampling frames, which would result
+			 * in a large throughput loss. */
+			mi->sample_count += (delta - mi->n_rates * 2);
+		}
+
+		sample_ndx = minstrel_get_next_sample(mi);
+		sample = true;
+		sample_slower = mrr && (mi->r[sample_ndx].perfect_tx_time >
+			mi->r[ndx].perfect_tx_time);
+
+		if (!sample_slower) {
+			ndx = sample_ndx;
+			mi->sample_count++;
+		} else {
+			/* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark
+			 * packets that have the sampling rate deferred to the
+			 * second MRR stage. Increase the sample counter only
+			 * if the deferred sample rate was actually used.
+			 * Use the sample_deferred counter to make sure that
+			 * the sampling is not done in large bursts */
+			info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+			mi->sample_deferred++;
+		}
+	}
+	sel->rate_idx = mi->r[ndx].rix;
+	info->control.retry_limit = minstrel_get_retry_count(&mi->r[ndx], info);
+
+	if (!mrr) {
+		ar[0].rate_idx = mi->lowest_rix;
+		ar[0].limit = mp->max_retry;
+		ar[1].rate_idx = -1;
+		return;
+	}
+
+	/* MRR setup */
+	if (sample) {
+		if (sample_slower)
+			mrr_ndx[0] = sample_ndx;
+		else
+			mrr_ndx[0] = mi->max_tp_rate;
+	} else {
+		mrr_ndx[0] = mi->max_tp_rate2;
+	}
+	mrr_ndx[1] = mi->max_prob_rate;
+	mrr_ndx[2] = 0;
+	for (i = 0; i < 3; i++) {
+		ar[i].rate_idx = mi->r[mrr_ndx[i]].rix;
+		ar[i].limit = mi->r[mrr_ndx[i]].adjusted_retry_count;
+	}
+}
+
+
+static void
+calc_rate_durations(struct minstrel_sta_info *mi, struct ieee80211_local *local,
+                    struct minstrel_rate *d, struct ieee80211_rate *rate)
+{
+	int erp = !!(rate->flags & IEEE80211_RATE_ERP_G);
+
+	d->perfect_tx_time = ieee80211_frame_duration(local, 1200,
+			rate->bitrate, erp, 1);
+	d->ack_time = ieee80211_frame_duration(local, 10,
+			rate->bitrate, erp, 1);
+}
+
+static void
+init_sample_table(struct minstrel_sta_info *mi)
+{
+	unsigned int i, col, new_idx;
+	unsigned int n_srates = mi->n_rates - 1;
+	u8 rnd[8];
+
+	mi->sample_column = 0;
+	mi->sample_idx = 0;
+	memset(mi->sample_table, 0, SAMPLE_COLUMNS * mi->n_rates);
+
+	for (col = 0; col < SAMPLE_COLUMNS; col++) {
+		for (i = 0; i < n_srates; i++) {
+			get_random_bytes(rnd, sizeof(rnd));
+			new_idx = (i + rnd[i & 7]) % n_srates;
+
+			while (SAMPLE_TBL(mi, new_idx, col) != 0)
+				new_idx = (new_idx + 1) % n_srates;
+
+			/* Don't sample the slowest rate (i.e. slowest base
+			 * rate). We must presume that the slowest rate works
+			 * fine, or else other management frames will also be
+			 * failing and the link will break */
+			SAMPLE_TBL(mi, new_idx, col) = i + 1;
+		}
+	}
+}
+
+static void
+minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
+               struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+	struct minstrel_priv *mp = priv;
+	struct minstrel_rate *mr_ctl;
+	unsigned int i, n = 0;
+	unsigned int t_slot = 9; /* FIXME: get real slot time */
+
+	mi->lowest_rix = rate_lowest_index(sband, sta);
+	mr_ctl = &mi->r[rix_to_ndx(mi, mi->lowest_rix)];
+	mi->sp_ack_dur = mr_ctl->ack_time;
+
+	for (i = 0; i < sband->n_bitrates; i++) {
+		struct minstrel_rate *mr = &mi->r[n];
+		unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0;
+		unsigned int tx_time_single;
+		unsigned int cw = mp->cw_min;
+
+		if (!rate_supported(sta, sband->band, i))
+			continue;
+		n++;
+		memset(mr, 0, sizeof(*mr));
+
+		mr->rix = i;
+		mr->bitrate = sband->bitrates[i].bitrate / 5;
+		calc_rate_durations(mi, hw_to_local(mp->hw), mr,
+				&sband->bitrates[i]);
+
+		/* calculate maximum number of retransmissions before
+		 * fallback (based on maximum segment size) */
+		mr->retry_count = 1;
+		mr->retry_count_cts = 1;
+		mr->retry_count_rtscts = 1;
+		tx_time = mr->perfect_tx_time + mi->sp_ack_dur;
+		do {
+			/* add one retransmission */
+			tx_time_single = mr->ack_time + mr->perfect_tx_time;
+
+			/* contention window */
+			tx_time_single += t_slot + min(cw, mp->cw_max);
+			cw = (cw + 1) << 1;
+
+			tx_time += tx_time_single;
+			tx_time_cts += tx_time_single + mi->sp_ack_dur;
+			tx_time_rtscts += tx_time_single + 2 * mi->sp_ack_dur;
+			if ((tx_time_cts < mp->segment_size) &&
+				(mr->retry_count_cts < mp->max_retry))
+				mr->retry_count_cts++;
+			if ((tx_time_rtscts < mp->segment_size) &&
+				(mr->retry_count_rtscts < mp->max_retry))
+				mr->retry_count_rtscts++;
+		} while ((tx_time < mp->segment_size) &&
+				(++mr->retry_count < mp->max_retry));
+		mr->adjusted_retry_count = mr->retry_count;
+	}
+
+	for (i = n; i < sband->n_bitrates; i++) {
+		struct minstrel_rate *mr = &mi->r[i];
+		mr->rix = -1;
+	}
+
+	mi->n_rates = n;
+	mi->stats_update = jiffies;
+
+	init_sample_table(mi);
+}
+
+static void *
+minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
+{
+	struct ieee80211_supported_band *sband;
+	struct minstrel_sta_info *mi;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_hw *hw = mp->hw;
+	int max_rates = 0;
+	int i;
+
+	mi = kzalloc(sizeof(struct minstrel_sta_info), gfp);
+	if (!mi)
+		return NULL;
+
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+		sband = hw->wiphy->bands[hw->conf.channel->band];
+		if (sband->n_bitrates > max_rates)
+			max_rates = sband->n_bitrates;
+	}
+
+	mi->r = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp);
+	if (!mi->r)
+		goto error;
+
+	mi->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp);
+	if (!mi->sample_table)
+		goto error1;
+
+	mi->stats_update = jiffies;
+	return mi;
+
+error1:
+	kfree(mi->r);
+error:
+	kfree(mi);
+	return NULL;
+}
+
+static void
+minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+
+	kfree(mi->sample_table);
+	kfree(mi->r);
+	kfree(mi);
+}
+
+static void
+minstrel_clear(void *priv)
+{
+}
+
+static void *
+minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
+{
+	struct minstrel_priv *mp;
+
+	mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
+	if (!mp)
+		return NULL;
+
+	/* contention window settings
+	 * Just an approximation. Using the per-queue values would complicate
+	 * the calculations and is probably unnecessary */
+	mp->cw_min = 15;
+	mp->cw_max = 1023;
+
+	/* number of packets (in %) to use for sampling other rates
+	 * sample less often for non-mrr packets, because the overhead
+	 * is much higher than with mrr */
+	mp->lookaround_rate = 5;
+	mp->lookaround_rate_mrr = 10;
+
+	/* moving average weight for EWMA */
+	mp->ewma_level = 75;
+
+	/* maximum time that the hw is allowed to stay in one MRR segment */
+	mp->segment_size = 6000;
+
+	if (hw->max_altrate_tries > 0)
+		mp->max_retry = hw->max_altrate_tries;
+	else
+		/* safe default, does not necessarily have to match hw properties */
+		mp->max_retry = 7;
+
+	if (hw->max_altrates >= 3)
+		mp->has_mrr = true;
+
+	mp->hw = hw;
+	mp->update_interval = 100;
+
+	return mp;
+}
+
+static void
+minstrel_free(void *priv)
+{
+	kfree(priv);
+}
+
+static struct rate_control_ops mac80211_minstrel = {
+	.name = "minstrel",
+	.tx_status = minstrel_tx_status,
+	.get_rate = minstrel_get_rate,
+	.rate_init = minstrel_rate_init,
+	.clear = minstrel_clear,
+	.alloc = minstrel_alloc,
+	.free = minstrel_free,
+	.alloc_sta = minstrel_alloc_sta,
+	.free_sta = minstrel_free_sta,
+#ifdef CONFIG_MAC80211_DEBUGFS
+	.add_sta_debugfs = minstrel_add_sta_debugfs,
+	.remove_sta_debugfs = minstrel_remove_sta_debugfs,
+#endif
+};
+
+int __init
+rc80211_minstrel_init(void)
+{
+	return ieee80211_rate_control_register(&mac80211_minstrel);
+}
+
+void
+rc80211_minstrel_exit(void)
+{
+	ieee80211_rate_control_unregister(&mac80211_minstrel);
+}
+
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
new file mode 100644
index 00000000000..9a90a6aee04
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RC_MINSTREL_H
+#define __RC_MINSTREL_H
+
+struct minstrel_rate {
+	int bitrate;
+	int rix;
+
+	unsigned int perfect_tx_time;
+	unsigned int ack_time;
+
+	unsigned int retry_count;
+	unsigned int retry_count_cts;
+	unsigned int retry_count_rtscts;
+	unsigned int adjusted_retry_count;
+
+	u32 success;
+	u32 attempts;
+	u32 last_attempts;
+	u32 last_success;
+
+	/* parts per thousand */
+	u32 cur_prob;
+	u32 probability;
+
+	/* per-rate throughput */
+	u32 cur_tp;
+	u32 throughput;
+
+	u64 succ_hist;
+	u64 att_hist;
+};
+
+struct minstrel_sta_info {
+	unsigned long stats_update;
+	unsigned int sp_ack_dur;
+	unsigned int rate_avg;
+
+	unsigned int lowest_rix;
+
+	unsigned int max_tp_rate;
+	unsigned int max_tp_rate2;
+	unsigned int max_prob_rate;
+	unsigned int packet_count;
+	unsigned int sample_count;
+	int sample_deferred;
+
+	unsigned int sample_idx;
+	unsigned int sample_column;
+
+	int n_rates;
+	struct minstrel_rate *r;
+
+	/* sampling table */
+	u8 *sample_table;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct dentry *dbg_stats;
+#endif
+};
+
+struct minstrel_priv {
+	struct ieee80211_hw *hw;
+	bool has_mrr;
+	unsigned int cw_min;
+	unsigned int cw_max;
+	unsigned int max_retry;
+	unsigned int ewma_level;
+	unsigned int segment_size;
+	unsigned int update_interval;
+	unsigned int lookaround_rate;
+	unsigned int lookaround_rate_mrr;
+};
+
+void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
+void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
+
+#endif
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
new file mode 100644
index 00000000000..0b024cd6b80
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on minstrel.c:
+ *   Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz>
+ *   Sponsored by Indranet Technologies Ltd
+ *
+ * Based on sample.c:
+ *   Copyright (c) 2005 John Bicket
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer,
+ *      without modification.
+ *   2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *      similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *      redistribution must be conditioned upon including a substantially
+ *      similar Disclaimer requirement for further binary redistribution.
+ *   3. Neither the names of the above-listed copyright holders nor the names
+ *      of any contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *   Alternatively, this software may be distributed under the terms of the
+ *   GNU General Public License ("GPL") version 2 as published by the Free
+ *   Software Foundation.
+ *
+ *   NO WARRANTY
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ *   AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ *   THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ *   OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ *   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGES.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rc80211_minstrel.h"
+
+struct minstrel_stats_info {
+	struct minstrel_sta_info *mi;
+	char buf[4096];
+	size_t len;
+};
+
+static int
+minstrel_stats_open(struct inode *inode, struct file *file)
+{
+	struct minstrel_sta_info *mi = inode->i_private;
+	struct minstrel_stats_info *ms;
+	unsigned int i, tp, prob, eprob;
+	char *p;
+
+	ms = kmalloc(sizeof(*ms), GFP_KERNEL);
+	if (!ms)
+		return -ENOMEM;
+
+	file->private_data = ms;
+	p = ms->buf;
+	p += sprintf(p, "rate     throughput  ewma prob   this prob  "
+			"this succ/attempt   success    attempts\n");
+	for (i = 0; i < mi->n_rates; i++) {
+		struct minstrel_rate *mr = &mi->r[i];
+
+		*(p++) = (i == mi->max_tp_rate) ? 'T' : ' ';
+		*(p++) = (i == mi->max_tp_rate2) ? 't' : ' ';
+		*(p++) = (i == mi->max_prob_rate) ? 'P' : ' ';
+		p += sprintf(p, "%3u%s", mr->bitrate / 2,
+				(mr->bitrate & 1 ? ".5" : "  "));
+
+		tp = ((mr->cur_tp * 96) / 18000) >> 10;
+		prob = mr->cur_prob / 18;
+		eprob = mr->probability / 18;
+
+		p += sprintf(p, "  %6u.%1u   %6u.%1u   %6u.%1u        "
+				"%3u(%3u)   %8llu    %8llu\n",
+				tp / 10, tp % 10,
+				eprob / 10, eprob % 10,
+				prob / 10, prob % 10,
+				mr->last_success,
+				mr->last_attempts,
+				mr->succ_hist,
+				mr->att_hist);
+	}
+	p += sprintf(p, "\nTotal packet count::    ideal %d      "
+			"lookaround %d\n\n",
+			mi->packet_count - mi->sample_count,
+			mi->sample_count);
+	ms->len = p - ms->buf;
+
+	return 0;
+}
+
+static int
+minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *o)
+{
+	struct minstrel_stats_info *ms;
+	char *src;
+
+	ms = file->private_data;
+	src = ms->buf;
+
+	len = min(len, ms->len);
+	if (len <= *o)
+		return 0;
+
+	src += *o;
+	len -= *o;
+	*o += len;
+
+	if (copy_to_user(buf, src, len))
+		return -EFAULT;
+
+	return len;
+}
+
+static int
+minstrel_stats_release(struct inode *inode, struct file *file)
+{
+	struct minstrel_stats_info *ms = file->private_data;
+
+	kfree(ms);
+
+	return 0;
+}
+
+static struct file_operations minstrel_stat_fops = {
+	.owner = THIS_MODULE,
+	.open = minstrel_stats_open,
+	.read = minstrel_stats_read,
+	.release = minstrel_stats_release,
+};
+
+void
+minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+
+	mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi,
+			&minstrel_stat_fops);
+}
+
+void
+minstrel_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+
+	debugfs_remove(mi->dbg_stats);
+}
-- 
cgit v1.2.3


From ad788b5e079484aa1d48aa90a3ebd7d954d2e7db Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Wed, 1 Oct 2008 15:45:02 -0400
Subject: mac80211: avoid "Wireless Event too big" message for assoc response

The association response IEs are sent to userland with an IWEVCUSTOM
event, which unfortunately is limited to a little more than 100 bytes
of IE information with the encoding used.  Many APs send so much
IE information that this message overflows.  When the IWEVCUSTOM
event is too large, the kernel doesn't send it to userland anyway --
better just not to send it.

An attempt was made by Jouni Malinen to correct this issue by
converting to use IWEVASSOCREQIE and IWEVASSOCRESPIE messages instead
("mac80211: Use IWEVASSOCREQIE instead of IWEVCUSTOM").  Unfortunately,
that caused a problem due to 32-/64-bit interactions on some systems and
was reverted after the 'userland ABI' rule was invoked.  That leaves
us with this option instead of a proper fix, at least until we move
to a cfg80211-based solution.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index edc339d649c..49f86fa56bf 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -690,9 +690,11 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	memset(&wrqu, 0, sizeof(wrqu));
-	wrqu.data.length = len;
-	wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+	if (len <= IW_CUSTOM_MAX) {
+		memset(&wrqu, 0, sizeof(wrqu));
+		wrqu.data.length = len;
+		wireless_send_event(sdata->dev, IWEVCUSTOM, &wrqu, buf);
+	}
 
 	kfree(buf);
 }
-- 
cgit v1.2.3


From 9a1f27c48065ce713eb47f2fd475b717e63ef239 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 7 Oct 2008 11:41:57 -0700
Subject: inet_hashtables: Add inet_lookup_skb helpers

To be able to use the cached socket reference in the skb during input
processing we add a new set of lookup functions that receive the skb on
their argument list.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c     | 5 ++---
 net/dccp/ipv6.c     | 6 ++----
 net/ipv4/tcp_ipv4.c | 3 +--
 net/ipv6/tcp_ipv6.c | 6 +-----
 4 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 882c5c4de69..e3dfddab21c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -811,9 +811,8 @@ static int dccp_v4_rcv(struct sk_buff *skb)
 
 	/* Step 2:
 	 *	Look up flow ID in table and get corresponding socket */
-	sk = __inet_lookup(dev_net(skb->dst->dev), &dccp_hashinfo,
-			   iph->saddr, dh->dccph_sport,
-			   iph->daddr, dh->dccph_dport, inet_iif(skb));
+	sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+			       dh->dccph_sport, dh->dccph_dport);
 	/*
 	 * Step 2:
 	 *	If no socket ...
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 5e1ee0da2c4..caa7f346962 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -805,10 +805,8 @@ static int dccp_v6_rcv(struct sk_buff *skb)
 
 	/* Step 2:
 	 *	Look up flow ID in table and get corresponding socket */
-	sk = __inet6_lookup(dev_net(skb->dst->dev), &dccp_hashinfo,
-			    &ipv6_hdr(skb)->saddr, dh->dccph_sport,
-			    &ipv6_hdr(skb)->daddr, ntohs(dh->dccph_dport),
-			    inet6_iif(skb));
+	sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+			        dh->dccph_sport, dh->dccph_dport);
 	/*
 	 * Step 2:
 	 *	If no socket ...
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8b24bd833cb..24ffc5e1d3d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1577,8 +1577,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->flags	 = iph->tos;
 	TCP_SKB_CB(skb)->sacked	 = 0;
 
-	sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
-			th->source, iph->daddr, th->dest, inet_iif(skb));
+	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
 	if (!sk)
 		goto no_tcp_socket;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index df16b68644e..6268d266c03 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1681,11 +1681,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(ipv6_hdr(skb));
 	TCP_SKB_CB(skb)->sacked = 0;
 
-	sk = __inet6_lookup(net, &tcp_hashinfo,
-			&ipv6_hdr(skb)->saddr, th->source,
-			&ipv6_hdr(skb)->daddr, ntohs(th->dest),
-			inet6_iif(skb));
-
+	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
 	if (!sk)
 		goto no_tcp_socket;
 
-- 
cgit v1.2.3


From 607c4aaf03041c8bd81555a0218050c0f895088e Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Tue, 7 Oct 2008 12:38:32 -0700
Subject: inet: Add udplib_lookup_skb() helpers

To be able to use the cached socket reference in the skb during input
processing we add a new set of lookup functions that receive the skb on
their argument list.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 14 ++++++++++++--
 net/ipv6/udp.c | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c83d0ef469c..c7a90b546b2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -302,6 +302,17 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	return result;
 }
 
+static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
+						 __be16 sport, __be16 dport,
+						 struct hlist_head udptable[])
+{
+	const struct iphdr *iph = ip_hdr(skb);
+
+	return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport,
+				 iph->daddr, dport, inet_iif(skb),
+				 udptable);
+}
+
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			     __be32 daddr, __be16 dport, int dif)
 {
@@ -1208,8 +1219,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
 		return __udp4_lib_mcast_deliver(net, skb, uh,
 				saddr, daddr, udptable);
 
-	sk = __udp4_lib_lookup(net, saddr, uh->source, daddr,
-			uh->dest, inet_iif(skb), udptable);
+	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
 
 	if (sk != NULL) {
 		int ret = udp_queue_rcv_skb(sk, skb);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a6aecf76a71..ce26c41d157 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -107,6 +107,17 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 	return result;
 }
 
+static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
+					  __be16 sport, __be16 dport,
+					  struct hlist_head udptable[])
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+
+	return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport,
+				 &iph->daddr, dport, inet6_iif(skb),
+				 udptable);
+}
+
 /*
  * 	This should be easy, if there is something there we
  * 	return it, otherwise we block.
@@ -488,8 +499,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
 	 * check socket cache ... must talk to Alan about his plans
 	 * for sock caches... i'll skip this for now.
 	 */
-	sk = __udp6_lib_lookup(net, saddr, uh->source,
-			       daddr, uh->dest, inet6_iif(skb), udptable);
+	sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
 
 	if (sk == NULL) {
 		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
-- 
cgit v1.2.3


From 23542618deb77cfed312842fe8c41ed19fb16470 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Tue, 7 Oct 2008 12:41:01 -0700
Subject: inet: Don't lookup the socket if there's a socket attached to the skb

Use the socket cached in the skb if it's present.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 10 +++++++---
 net/ipv6/udp.c | 10 +++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c7a90b546b2..822c9deac83 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -306,11 +306,15 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 						 __be16 sport, __be16 dport,
 						 struct hlist_head udptable[])
 {
+	struct sock *sk;
 	const struct iphdr *iph = ip_hdr(skb);
 
-	return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport,
-				 iph->daddr, dport, inet_iif(skb),
-				 udptable);
+	if (unlikely(sk = skb_steal_sock(skb)))
+		return sk;
+	else
+		return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport,
+					 iph->daddr, dport, inet_iif(skb),
+					 udptable);
 }
 
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ce26c41d157..e51da8c092f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,11 +111,15 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 					  __be16 sport, __be16 dport,
 					  struct hlist_head udptable[])
 {
+	struct sock *sk;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 
-	return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport,
-				 &iph->daddr, dport, inet6_iif(skb),
-				 udptable);
+	if (unlikely(sk = skb_steal_sock(skb)))
+		return sk;
+	else
+		return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport,
+					 &iph->daddr, dport, inet6_iif(skb),
+					 udptable);
 }
 
 /*
-- 
cgit v1.2.3


From 68fffc679694d5f7c02fdeb684b481416cd8213b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Oct 2008 14:12:10 -0700
Subject: ipv6: clean up ip6_route_net_init() error handling

ip6_route_net_init() error handling looked less than solid, fix 'er up.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4385a6569c..635d97d54b0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2631,10 +2631,8 @@ static int ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
 					       sizeof(*net->ipv6.ip6_prohibit_entry),
 					       GFP_KERNEL);
-	if (!net->ipv6.ip6_prohibit_entry) {
-		kfree(net->ipv6.ip6_null_entry);
-		goto out;
-	}
+	if (!net->ipv6.ip6_prohibit_entry)
+		goto out_ip6_null_entry;
 	net->ipv6.ip6_prohibit_entry->u.dst.path =
 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
 	net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
@@ -2642,11 +2640,8 @@ static int ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
 					       GFP_KERNEL);
-	if (!net->ipv6.ip6_blk_hole_entry) {
-		kfree(net->ipv6.ip6_null_entry);
-		kfree(net->ipv6.ip6_prohibit_entry);
-		goto out;
-	}
+	if (!net->ipv6.ip6_blk_hole_entry)
+		goto out_ip6_prohibit_entry;
 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
@@ -2662,6 +2657,12 @@ static int ip6_route_net_init(struct net *net)
 out:
 	return ret;
 
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_ip6_prohibit_entry:
+	kfree(net->ipv6.ip6_prohibit_entry);
+out_ip6_null_entry:
+	kfree(net->ipv6.ip6_null_entry);
+#endif
 out_ip6_dst_ops:
 	release_net(net->ipv6.ip6_dst_ops->dst_net);
 	kfree(net->ipv6.ip6_dst_ops);
-- 
cgit v1.2.3


From b339a47c370ec669f789c5989f54eec1d78574bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Oct 2008 14:15:00 -0700
Subject: ipv6: initialize ip6_route sysctl vars in ip6_route_net_init()

This makes that ip6_route_net_init() does all of the route init code.
There used to be a race between ip6_route_net_init() and ip6_net_init()
and someone relying on the combined result was left out cold.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 8 --------
 net/ipv6/route.c    | 9 +++++++++
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index f018704ecb8..af90905fd0f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -840,14 +840,6 @@ static int inet6_net_init(struct net *net)
 	int err = 0;
 
 	net->ipv6.sysctl.bindv6only = 0;
-	net->ipv6.sysctl.flush_delay = 0;
-	net->ipv6.sysctl.ip6_rt_max_size = 4096;
-	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
-	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
-	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
-	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
-	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
-	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 635d97d54b0..e10a1701550 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2647,6 +2647,15 @@ static int ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
 #endif
 
+	net->ipv6.sysctl.flush_delay = 0;
+	net->ipv6.sysctl.ip6_rt_max_size = 4096;
+	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
+	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
+	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
+	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
+	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
+	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+
 #ifdef CONFIG_PROC_FS
 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
-- 
cgit v1.2.3


From c57943a1c96214ee68f3890bb6772841ffbfd606 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Oct 2008 14:18:42 -0700
Subject: net: wrap sk->sk_backlog_rcv()

Wrap calling sk->sk_backlog_rcv() in a function. This will allow extending the
generic sk_backlog_rcv behaviour.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c      | 4 ++--
 net/ipv4/tcp.c       | 2 +-
 net/ipv4/tcp_timer.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2d358dd8a03..5e2a3132a8c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -327,7 +327,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 		 */
 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 
-		rc = sk->sk_backlog_rcv(sk, skb);
+		rc = sk_backlog_rcv(sk, skb);
 
 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 	} else
@@ -1374,7 +1374,7 @@ static void __release_sock(struct sock *sk)
 			struct sk_buff *next = skb->next;
 
 			skb->next = NULL;
-			sk->sk_backlog_rcv(sk, skb);
+			sk_backlog_rcv(sk, skb);
 
 			/*
 			 * We are in process context here with softirqs
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7d81a1ee550..7d3fe571d15 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1161,7 +1161,7 @@ static void tcp_prequeue_process(struct sock *sk)
 	 * necessary */
 	local_bh_disable();
 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-		sk->sk_backlog_rcv(sk, skb);
+		sk_backlog_rcv(sk, skb);
 	local_bh_enable();
 
 	/* Clear memory counter. */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5ab6ba19c3c..6b6dff1164b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -201,7 +201,7 @@ static void tcp_delack_timer(unsigned long data)
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
 
 		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-			sk->sk_backlog_rcv(sk, skb);
+			sk_backlog_rcv(sk, skb);
 
 		tp->ucopy.memory = 0;
 	}
-- 
cgit v1.2.3


From 654bed16cf86a9ef94495d9e6131b7ff7840a3dd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Oct 2008 14:22:33 -0700
Subject: net: packet split receive api

Add some packet-split receive hooks.

For one this allows to do NUMA node affine page allocs. Later on these
hooks will be extended to do emergency reserve allocations for
fragments.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8bd248a6487..7f7bb1a636d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -263,6 +263,26 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 	return skb;
 }
 
+struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
+{
+	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
+	struct page *page;
+
+	page = alloc_pages_node(node, gfp_mask, 0);
+	return page;
+}
+EXPORT_SYMBOL(__netdev_alloc_page);
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+		int size)
+{
+	skb_fill_page_desc(skb, i, page, off, size);
+	skb->len += size;
+	skb->data_len += size;
+	skb->truesize += size;
+}
+EXPORT_SYMBOL(skb_add_rx_frag);
+
 /**
  *	dev_alloc_skb - allocate an skbuff for receiving
  *	@length: length to allocate
-- 
cgit v1.2.3


From 33f5f57eeb0c6386fdd85f9c690dc8d700ba7928 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Tue, 7 Oct 2008 14:43:06 -0700
Subject: tcp: kill pointless urg_mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It all started from me noticing that this urgent check in
tcp_clean_rtx_queue is unnecessarily inside the loop. Then
I took a longer look to it and found out that the users of
urg_mode can trivially do without, well almost, there was
one gotcha.

Bonus: those funny people who use urg with >= 2^31 write_seq -
snd_una could now rejoice too (that's the only purpose for the
between being there, otherwise a simple compare would have done
the thing). Not that I assume that the rest of the tcp code
happily lives with such mind-boggling numbers :-). Alas, it
turned out to be impossible to set wmem to such numbers anyway,
yes I really tried a big sendfile after setting some wmem but
nothing happened :-). ...Tcp_wmem is int and so is sk_sndbuf...
So I hacked a bit variable to long and found out that it seems
to work... :-)

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c           |  4 +---
 net/ipv4/tcp_input.c     | 11 ++++++-----
 net/ipv4/tcp_minisocks.c |  1 +
 net/ipv4/tcp_output.c    | 18 ++++++++++++------
 4 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7d3fe571d15..eccb7165a80 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -497,10 +497,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 				struct sk_buff *skb)
 {
-	if (flags & MSG_OOB) {
-		tp->urg_mode = 1;
+	if (flags & MSG_OOB)
 		tp->snd_up = tp->write_seq;
-	}
 }
 
 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3b76bce769d..c19f429dc44 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2836,7 +2836,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * is before the ack sequence we can discard it as it's confirmed to have
  * arrived at the other end.
  */
-static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
+static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+			       u32 prior_snd_una)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2903,9 +2904,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
 
-		if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up)))
-			tp->urg_mode = 0;
-
 		tp->packets_out -= acked_pcount;
 		pkts_acked += acked_pcount;
 
@@ -2935,6 +2933,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
 			tp->lost_skb_hint = NULL;
 	}
 
+	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
+		tp->snd_up = tp->snd_una;
+
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
@@ -3311,7 +3312,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		goto no_queue;
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets);
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
 
 	if (tp->frto_counter)
 		frto_cwnd = tcp_process_frto(sk, flag);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f976fc57892..779f2e9d068 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -395,6 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->pred_flags = 0;
 		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
 		newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
+		newtp->snd_up = treq->snt_isn + 1;
 
 		tcp_prequeue_init(newtp);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 493553c71d3..990a5849323 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -345,6 +345,11 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
 
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+	return tp->snd_una != tp->snd_up;
+}
+
 #define OPTION_SACK_ADVERTISE	(1 << 0)
 #define OPTION_TS		(1 << 1)
 #define OPTION_MD5		(1 << 2)
@@ -646,7 +651,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	th->check		= 0;
 	th->urg_ptr		= 0;
 
-	if (unlikely(tp->urg_mode &&
+	/* The urg_mode check is necessary during a below snd_una win probe */
+	if (unlikely(tcp_urg_mode(tp) &&
 		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
 		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
 		th->urg			= 1;
@@ -1012,7 +1018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
  *
- * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up
  * cannot be large. However, taking into account rare use of URG, this
  * is not a big flaw.
  */
@@ -1029,7 +1035,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
 	mss_now = tp->mss_cache;
 
-	if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
+	if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp))
 		doing_tso = 1;
 
 	if (dst) {
@@ -1193,7 +1199,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 	/* Don't use the nagle rule for urgent data (or for the final FIN).
 	 * Nagle can be ignored during F-RTO too (see RFC4138).
 	 */
-	if (tp->urg_mode || (tp->frto_counter == 2) ||
+	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
 	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
 		return 1;
 
@@ -2358,6 +2364,7 @@ static void tcp_connect_init(struct sock *sk)
 	tcp_init_wl(tp, tp->write_seq, 0);
 	tp->snd_una = tp->write_seq;
 	tp->snd_sml = tp->write_seq;
+	tp->snd_up = tp->write_seq;
 	tp->rcv_nxt = 0;
 	tp->rcv_wup = 0;
 	tp->copied_seq = 0;
@@ -2567,8 +2574,7 @@ int tcp_write_wakeup(struct sock *sk)
 			tcp_event_new_data_sent(sk, skb);
 		return err;
 	} else {
-		if (tp->urg_mode &&
-		    between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
 			tcp_xmit_probe_skb(sk, 1);
 		return tcp_xmit_probe_skb(sk, 0);
 	}
-- 
cgit v1.2.3


From 4a7e56098f06d505f23f8d7c8d6762221065922a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Tue, 7 Oct 2008 14:43:31 -0700
Subject: tcp: cleanup messy initializer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm quite sure that if I give this function in its old format
for you to inspect, you start to wonder what is the type of
demanded or if it's a global variable.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c19f429dc44..63da39372d4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4461,8 +4461,8 @@ static void tcp_new_space(struct sock *sk)
 
 	if (tcp_should_expand_sndbuf(sk)) {
 		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
-			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
-		    demanded = max_t(unsigned int, tp->snd_cwnd,
+			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+		int demanded = max_t(unsigned int, tp->snd_cwnd,
 				     tp->reordering + 1);
 		sndmem *= 2 * demanded;
 		if (sndmem > sk->sk_sndbuf)
-- 
cgit v1.2.3


From 835bcc0497e18f54153ac9e32b598dd8ffb7aa66 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:45:55 -0700
Subject: netns: move /proc/net/dev_snmp6 to struct net

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/proc.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 0179b66864f..16ebf85d4ad 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -29,8 +29,6 @@
 #include <net/transp_v6.h>
 #include <net/ipv6.h>
 
-static struct proc_dir_entry *proc_net_devsnmp6;
-
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
@@ -210,18 +208,20 @@ static const struct file_operations snmp6_seq_fops = {
 int snmp6_register_dev(struct inet6_dev *idev)
 {
 	struct proc_dir_entry *p;
+	struct net *net;
 
 	if (!idev || !idev->dev)
 		return -EINVAL;
 
-	if (!net_eq(dev_net(idev->dev), &init_net))
+	net = dev_net(idev->dev);
+	if (!net_eq(net, &init_net))
 		return 0;
 
-	if (!proc_net_devsnmp6)
+	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 
 	p = proc_create_data(idev->dev->name, S_IRUGO,
-			     proc_net_devsnmp6, &snmp6_seq_fops, idev);
+			     net->mib.proc_net_devsnmp6, &snmp6_seq_fops, idev);
 	if (!p)
 		return -ENOMEM;
 
@@ -231,12 +231,13 @@ int snmp6_register_dev(struct inet6_dev *idev)
 
 int snmp6_unregister_dev(struct inet6_dev *idev)
 {
-	if (!proc_net_devsnmp6)
+	struct net *net = dev_net(idev->dev);
+	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 	if (!idev || !idev->stats.proc_dir_entry)
 		return -EINVAL;
 	remove_proc_entry(idev->stats.proc_dir_entry->name,
-			  proc_net_devsnmp6);
+			  net->mib.proc_net_devsnmp6);
 	idev->stats.proc_dir_entry = NULL;
 	return 0;
 }
@@ -269,8 +270,9 @@ int __init ipv6_misc_proc_init(void)
 	if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops))
 		goto proc_snmp6_fail;
 
-	proc_net_devsnmp6 = proc_mkdir("dev_snmp6", init_net.proc_net);
-	if (!proc_net_devsnmp6)
+	init_net.mib.proc_net_devsnmp6 =
+		proc_mkdir("dev_snmp6", init_net.proc_net);
+	if (!init_net.mib.proc_net_devsnmp6)
 		goto proc_dev_snmp6_fail;
 out:
 	return rc;
-- 
cgit v1.2.3


From 06f38527decedbea0588256ecbb5784d4bb35b81 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:46:18 -0700
Subject: netns: register /proc/net/dev_snmp6/* in each ns

Do the same for /proc/net/snmp6.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/proc.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 16ebf85d4ad..57640620c16 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -247,12 +247,27 @@ static int ipv6_proc_init_net(struct net *net)
 	if (!proc_net_fops_create(net, "sockstat6", S_IRUGO,
 			&sockstat6_seq_fops))
 		return -ENOMEM;
+
+	if (!proc_net_fops_create(net, "snmp6", S_IRUGO, &snmp6_seq_fops))
+		goto proc_snmp6_fail;
+
+	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+	if (!net->mib.proc_net_devsnmp6)
+		goto proc_dev_snmp6_fail;
 	return 0;
+
+proc_snmp6_fail:
+	proc_net_remove(net, "sockstat6");
+proc_dev_snmp6_fail:
+	proc_net_remove(net, "dev_snmp6");
+	return -ENOMEM;
 }
 
 static void ipv6_proc_exit_net(struct net *net)
 {
 	proc_net_remove(net, "sockstat6");
+	proc_net_remove(net, "dev_snmp6");
+	proc_net_remove(net, "snmp6");
 }
 
 static struct pernet_operations ipv6_proc_ops = {
@@ -262,34 +277,11 @@ static struct pernet_operations ipv6_proc_ops = {
 
 int __init ipv6_misc_proc_init(void)
 {
-	int rc = 0;
-
-	if (register_pernet_subsys(&ipv6_proc_ops))
-		goto proc_net_fail;
-
-	if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops))
-		goto proc_snmp6_fail;
-
-	init_net.mib.proc_net_devsnmp6 =
-		proc_mkdir("dev_snmp6", init_net.proc_net);
-	if (!init_net.mib.proc_net_devsnmp6)
-		goto proc_dev_snmp6_fail;
-out:
-	return rc;
-
-proc_dev_snmp6_fail:
-	proc_net_remove(&init_net, "snmp6");
-proc_snmp6_fail:
-	unregister_pernet_subsys(&ipv6_proc_ops);
-proc_net_fail:
-	rc = -ENOMEM;
-	goto out;
+	return register_pernet_subsys(&ipv6_proc_ops);
 }
 
 void ipv6_misc_proc_exit(void)
 {
-	proc_net_remove(&init_net, "dev_snmp6");
-	proc_net_remove(&init_net, "snmp6");
 	unregister_pernet_subsys(&ipv6_proc_ops);
 }
 
-- 
cgit v1.2.3


From 35f0a5df6cbc315da031c40541e135a059bfde7d Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:46:47 -0700
Subject: ipv6: consolidate ipv6 sock_stat code at the beginning of
 net/ipv6/proc.c

Simple, comsolidate sockstat6 staff in one place, at the beginning of
the file. Right now sockstat6_seq_open/sockstat6_seq_fops looks like an
intrusion in the middle of snmp6 code.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/proc.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 57640620c16..25eda8bf218 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -46,6 +46,19 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int sockstat6_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, sockstat6_seq_show);
+}
+
+static const struct file_operations sockstat6_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = sockstat6_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
 static struct snmp_mib snmp6_ipstats_list[] = {
 /* ipv6 mib according to RFC 2465 */
 	SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INRECEIVES),
@@ -179,19 +192,6 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int sockstat6_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, sockstat6_seq_show);
-}
-
-static const struct file_operations sockstat6_seq_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = sockstat6_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 static int snmp6_seq_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, snmp6_seq_show, PDE(inode)->data);
-- 
cgit v1.2.3


From 7b43ccecc77480353a5657d993d671cae9e94efd Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:47:12 -0700
Subject: ipv6: separate seq_ops for global & per/device ipv6 statistics

idev has been stored on seq->private. NULL has been stored for global
statistics.

The situation is changed with net namespace. We need to store pointer to
struct net and the only place is seq->private. So, we'll have for
/proc/net/dev_snmp6/* and for /proc/net/snmp6 pointers of two different
types stored in the same field.

This effectively requires to separate seq_ops of these files.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/proc.c | 48 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 25eda8bf218..7601f56ce91 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -175,26 +175,17 @@ snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist)
 
 static int snmp6_seq_show(struct seq_file *seq, void *v)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)seq->private;
-
-	if (idev) {
-		seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
-		snmp6_seq_show_item(seq, (void **)idev->stats.ipv6, snmp6_ipstats_list);
-		snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list);
-		snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg);
-	} else {
-		snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
-		snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
-		snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
-		snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
-		snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
-	}
+	snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
+	snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
+	snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
+	snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
 	return 0;
 }
 
 static int snmp6_seq_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, snmp6_seq_show, PDE(inode)->data);
+	return single_open(file, snmp6_seq_show, NULL);
 }
 
 static const struct file_operations snmp6_seq_fops = {
@@ -205,6 +196,30 @@ static const struct file_operations snmp6_seq_fops = {
 	.release = single_release,
 };
 
+static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)seq->private;
+
+	seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
+	snmp6_seq_show_item(seq, (void **)idev->stats.ipv6, snmp6_ipstats_list);
+	snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg);
+	return 0;
+}
+
+static int snmp6_dev_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, snmp6_dev_seq_show, PDE(inode)->data);
+}
+
+static const struct file_operations snmp6_dev_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = snmp6_dev_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
 int snmp6_register_dev(struct inet6_dev *idev)
 {
 	struct proc_dir_entry *p;
@@ -221,7 +236,8 @@ int snmp6_register_dev(struct inet6_dev *idev)
 		return -ENOENT;
 
 	p = proc_create_data(idev->dev->name, S_IRUGO,
-			     net->mib.proc_net_devsnmp6, &snmp6_seq_fops, idev);
+			     net->mib.proc_net_devsnmp6,
+			     &snmp6_dev_seq_fops, idev);
 	if (!p)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 2b4209e4b7ba9c7d70910a665ae4b60d49b75fcd Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:47:37 -0700
Subject: netns: register global ipv6 mibs statistics in each namespace

Unused net variable will become used very soon.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/proc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 7601f56ce91..c38c9e55406 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -175,6 +175,8 @@ snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist)
 
 static int snmp6_seq_show(struct seq_file *seq, void *v)
 {
+	struct net *net = (struct net *)seq->private;
+
 	snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
 	snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
 	snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
@@ -185,7 +187,7 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 
 static int snmp6_seq_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, snmp6_seq_show, NULL);
+	return single_open_net(inode, file, snmp6_seq_show);
 }
 
 static const struct file_operations snmp6_seq_fops = {
@@ -193,7 +195,7 @@ static const struct file_operations snmp6_seq_fops = {
 	.open	 = snmp6_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = single_release,
+	.release = single_release_net,
 };
 
 static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
-- 
cgit v1.2.3


From ab38dc7a70e59a4888ab4acb51daf3c6012ce4b8 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:47:55 -0700
Subject: netns: allow per device ipv6 snmp statistics in non-initial namespace

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/proc.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index c38c9e55406..23e567fb1d3 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -231,9 +231,6 @@ int snmp6_register_dev(struct inet6_dev *idev)
 		return -EINVAL;
 
 	net = dev_net(idev->dev);
-	if (!net_eq(net, &init_net))
-		return 0;
-
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 
-- 
cgit v1.2.3


From e43291cb37406dae405d50332eaa1ba2264c8dce Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:48:53 -0700
Subject: netns: add stub functions for per/namespace mibs allocation

The content of init_ipv6_mibs/cleanup_ipv6_mibs will be moved to new
calls one by one next.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index af90905fd0f..e8f82eca160 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -835,6 +835,15 @@ static void cleanup_ipv6_mibs(void)
 	snmp_mib_free((void **)udplite_stats_in6);
 }
 
+static int __net_init ipv6_init_mibs(struct net *net)
+{
+	return 0;
+}
+
+static void __net_exit ipv6_cleanup_mibs(struct net *net)
+{
+}
+
 static int inet6_net_init(struct net *net)
 {
 	int err = 0;
@@ -842,6 +851,9 @@ static int inet6_net_init(struct net *net)
 	net->ipv6.sysctl.bindv6only = 0;
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
 
+	err = ipv6_init_mibs(net);
+	if (err)
+		return err;
 #ifdef CONFIG_PROC_FS
 	err = udp6_proc_init(net);
 	if (err)
@@ -852,7 +864,6 @@ static int inet6_net_init(struct net *net)
 	err = ac6_proc_init(net);
 	if (err)
 		goto proc_ac6_fail;
-out:
 #endif
 	return err;
 
@@ -861,7 +872,9 @@ proc_ac6_fail:
 	tcp6_proc_exit(net);
 proc_tcp6_fail:
 	udp6_proc_exit(net);
-	goto out;
+out:
+	ipv6_cleanup_mibs(net);
+	return err;
 #endif
 }
 
@@ -872,6 +885,7 @@ static void inet6_net_exit(struct net *net)
 	tcp6_proc_exit(net);
 	ac6_proc_exit(net);
 #endif
+	ipv6_cleanup_mibs(net);
 }
 
 static struct pernet_operations inet6_net_ops = {
-- 
cgit v1.2.3


From 0c7ed677fb7013c8028045d409a48ac42151187a Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:49:36 -0700
Subject: netns: make udpv6 mib per/namespace

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c      | 3 ---
 net/ipv6/af_inet6.c | 9 ++++-----
 net/ipv6/proc.c     | 3 ++-
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 822c9deac83..85f8e8e10b1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -108,9 +108,6 @@
  *	Snmp MIB for the UDP layer
  */
 
-DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
-EXPORT_SYMBOL(udp_stats_in6);
-
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e8f82eca160..e09139122ef 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -806,16 +806,12 @@ static int __init init_ipv6_mibs(void)
 	if (snmp_mib_init((void **)icmpv6msg_statistics,
 			  sizeof(struct icmpv6msg_mib)) < 0)
 		goto err_icmpmsg_mib;
-	if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0)
-		goto err_udp_mib;
 	if (snmp_mib_init((void **)udplite_stats_in6,
 			  sizeof (struct udp_mib)) < 0)
 		goto err_udplite_mib;
 	return 0;
 
 err_udplite_mib:
-	snmp_mib_free((void **)udp_stats_in6);
-err_udp_mib:
 	snmp_mib_free((void **)icmpv6msg_statistics);
 err_icmpmsg_mib:
 	snmp_mib_free((void **)icmpv6_statistics);
@@ -831,17 +827,20 @@ static void cleanup_ipv6_mibs(void)
 	snmp_mib_free((void **)ipv6_statistics);
 	snmp_mib_free((void **)icmpv6_statistics);
 	snmp_mib_free((void **)icmpv6msg_statistics);
-	snmp_mib_free((void **)udp_stats_in6);
 	snmp_mib_free((void **)udplite_stats_in6);
 }
 
 static int __net_init ipv6_init_mibs(struct net *net)
 {
+	if (snmp_mib_init((void **)net->mib.udp_stats_in6,
+			  sizeof (struct udp_mib)) < 0)
+		return -ENOMEM;
 	return 0;
 }
 
 static void __net_exit ipv6_cleanup_mibs(struct net *net)
 {
+	snmp_mib_free((void **)net->mib.udp_stats_in6);
 }
 
 static int inet6_net_init(struct net *net)
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 23e567fb1d3..3eaf20bf998 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -180,7 +180,8 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 	snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
 	snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
 	snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
-	snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
+	snmp6_seq_show_item(seq, (void **)net->mib.udp_stats_in6,
+			    snmp6_udp6_list);
 	snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
 	return 0;
 }
-- 
cgit v1.2.3


From be713a443ee019489890e93654557916fbf72612 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 7 Oct 2008 14:50:06 -0700
Subject: netns: make uplitev6 mib per/namespace

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 14 ++++++++------
 net/ipv6/proc.c     |  3 ++-
 net/ipv6/udplite.c  |  2 --
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e09139122ef..127b240d2d8 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -806,13 +806,8 @@ static int __init init_ipv6_mibs(void)
 	if (snmp_mib_init((void **)icmpv6msg_statistics,
 			  sizeof(struct icmpv6msg_mib)) < 0)
 		goto err_icmpmsg_mib;
-	if (snmp_mib_init((void **)udplite_stats_in6,
-			  sizeof (struct udp_mib)) < 0)
-		goto err_udplite_mib;
 	return 0;
 
-err_udplite_mib:
-	snmp_mib_free((void **)icmpv6msg_statistics);
 err_icmpmsg_mib:
 	snmp_mib_free((void **)icmpv6_statistics);
 err_icmp_mib:
@@ -827,7 +822,6 @@ static void cleanup_ipv6_mibs(void)
 	snmp_mib_free((void **)ipv6_statistics);
 	snmp_mib_free((void **)icmpv6_statistics);
 	snmp_mib_free((void **)icmpv6msg_statistics);
-	snmp_mib_free((void **)udplite_stats_in6);
 }
 
 static int __net_init ipv6_init_mibs(struct net *net)
@@ -835,12 +829,20 @@ static int __net_init ipv6_init_mibs(struct net *net)
 	if (snmp_mib_init((void **)net->mib.udp_stats_in6,
 			  sizeof (struct udp_mib)) < 0)
 		return -ENOMEM;
+	if (snmp_mib_init((void **)net->mib.udplite_stats_in6,
+			  sizeof (struct udp_mib)) < 0)
+		goto err_udplite_mib;
 	return 0;
+
+err_udplite_mib:
+	snmp_mib_free((void **)net->mib.udp_stats_in6);
+	return -ENOMEM;
 }
 
 static void __net_exit ipv6_cleanup_mibs(struct net *net)
 {
 	snmp_mib_free((void **)net->mib.udp_stats_in6);
+	snmp_mib_free((void **)net->mib.udplite_stats_in6);
 }
 
 static int inet6_net_init(struct net *net)
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 3eaf20bf998..c78cf754ef3 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -182,7 +182,8 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 	snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
 	snmp6_seq_show_item(seq, (void **)net->mib.udp_stats_in6,
 			    snmp6_udp6_list);
-	snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
+	snmp6_seq_show_item(seq, (void **)net->mib.udplite_stats_in6,
+			    snmp6_udplite6_list);
 	return 0;
 }
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index f6cdcb348e0..3cd1a1ac3d6 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -13,8 +13,6 @@
  */
 #include "udp_impl.h"
 
-DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6) __read_mostly;
-
 static int udplitev6_rcv(struct sk_buff *skb)
 {
 	return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
-- 
cgit v1.2.3


From c9f6cde6e26ef98ee9c4b6288b126ac9c580d88b Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 31 Jul 2008 09:53:56 +0400
Subject: sunrpc: do not pin sunrpc module in the memory

Basically, try_module_get here are pretty useless. Any other module using
this API will pin sunrpc in memory due using exported symbols.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprt.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 99a52aabe33..29e401bb612 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport)
 			goto out;
 	}
 
-	result = -EINVAL;
-	if (try_module_get(THIS_MODULE)) {
-		list_add_tail(&transport->list, &xprt_list);
-		printk(KERN_INFO "RPC: Registered %s transport module.\n",
-			transport->name);
-		result = 0;
-	}
+	list_add_tail(&transport->list, &xprt_list);
+	printk(KERN_INFO "RPC: Registered %s transport module.\n",
+	       transport->name);
+	result = 0;
 
 out:
 	spin_unlock(&xprt_list_lock);
@@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport)
 				"RPC: Unregistered %s transport module.\n",
 				transport->name);
 			list_del_init(&transport->list);
-			module_put(THIS_MODULE);
 			goto out;
 		}
 	}
-- 
cgit v1.2.3


From 9a4bd29fe8f6d3f015fe1c8e5450eb62cfebfcc9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 3 Oct 2008 16:48:34 -0400
Subject: SUNRPC: Fix autobind on cloned rpc clients

Despite the fact that cloned rpc clients won't have the cl_autobind flag
set, they may still find themselves calling rpcb_getport_async(). For this
to happen, it suffices for a _parent_ rpc_clnt to use autobinding, in which
case any clone may find itself triggering the !xprt_bound() case in
call_bind().

The correct fix for this is to walk back up the tree of cloned rpc clients,
in order to find the parent that 'owns' the transport, either because it
has clnt->cl_autobind set, or because it originally created the
transport...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 24db2b4d12d..172935b046d 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -469,6 +469,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
 	return rpc_run_task(&task_setup_data);
 }
 
+/*
+ * In the case where rpc clients have been cloned, we want to make
+ * sure that we use the program number/version etc of the actual
+ * owner of the xprt. To do so, we walk back up the tree of parents
+ * to find whoever created the transport and/or whoever has the
+ * autobind flag set.
+ */
+static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
+{
+	struct rpc_clnt *parent = clnt->cl_parent;
+
+	while (parent != clnt) {
+		if (parent->cl_xprt != clnt->cl_xprt)
+			break;
+		if (clnt->cl_autobind)
+			break;
+		clnt = parent;
+		parent = parent->cl_parent;
+	}
+	return clnt;
+}
+
 /**
  * rpcb_getport_async - obtain the port for a given RPC service on a given host
  * @task: task that is waiting for portmapper request
@@ -478,10 +500,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
  */
 void rpcb_getport_async(struct rpc_task *task)
 {
-	struct rpc_clnt *clnt = task->tk_client;
+	struct rpc_clnt *clnt;
 	struct rpc_procinfo *proc;
 	u32 bind_version;
-	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpc_xprt *xprt;
 	struct rpc_clnt	*rpcb_clnt;
 	static struct rpcbind_args *map;
 	struct rpc_task	*child;
@@ -490,13 +512,13 @@ void rpcb_getport_async(struct rpc_task *task)
 	size_t salen;
 	int status;
 
+	clnt = rpcb_find_transport_owner(task->tk_client);
+	xprt = clnt->cl_xprt;
+
 	dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
 		task->tk_pid, __func__,
 		clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot);
 
-	/* Autobind on cloned rpc clients is discouraged */
-	BUG_ON(clnt->cl_parent != clnt);
-
 	/* Put self on the wait queue to ensure we get notified if
 	 * some other task is already attempting to bind the port */
 	rpc_sleep_on(&xprt->binding, task, NULL);
@@ -578,9 +600,9 @@ void rpcb_getport_async(struct rpc_task *task)
 			task->tk_pid, __func__);
 		return;
 	}
-	rpc_put_task(child);
 
-	task->tk_xprt->stat.bind_count++;
+	xprt->stat.bind_count++;
+	rpc_put_task(child);
 	return;
 
 bailout_nofree:
-- 
cgit v1.2.3


From 96165e2b7c4e2c82a0b60c766d4a2036444c21a0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 3 Oct 2008 16:48:40 -0400
Subject: SUNRPC: Fix a memory leak in rpcb_getport_async

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 172935b046d..0a22f00734a 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -580,7 +580,7 @@ void rpcb_getport_async(struct rpc_task *task)
 		status = -ENOMEM;
 		dprintk("RPC: %5u %s: no memory available\n",
 			task->tk_pid, __func__);
-		goto bailout_nofree;
+		goto bailout_release_client;
 	}
 	map->r_prog = clnt->cl_prog;
 	map->r_vers = clnt->cl_vers;
@@ -605,6 +605,8 @@ void rpcb_getport_async(struct rpc_task *task)
 	rpc_put_task(child);
 	return;
 
+bailout_release_client:
+	rpc_release_client(rpcb_clnt);
 bailout_nofree:
 	rpcb_wake_rpcbind_waiters(xprt, status);
 	task->tk_status = status;
-- 
cgit v1.2.3


From 63ffc23d307c9534c732edd87895e37b223004a3 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 3 Oct 2008 23:41:51 -0400
Subject: sunrpc: fix oops in rpc_create when the mount namespace is unshared

On a system with nfs mounts, if a task unshares its mount namespace,
a oops can occur when the system is rebooted if the task is the last
to unreference the nfs mount. It will try to create a rpc request
using utsname() which has been invalidated by free_nsproxy().

The patch fixes the issue by using the global init_utsname() which is
always valid. the capability of identifying rpc clients per uts namespace
stills needs some extra work so this should not be a problem.

BUG: unable to handle kernel NULL pointer dereference at 00000004
IP: [<c024c9ab>] rpc_create+0x332/0x42f
Oops: 0000 [#1] DEBUG_PAGEALLOC

Pid: 1857, comm: uts-oops Not tainted (2.6.27-rc5-00319-g7686ad5 #4)
EIP: 0060:[<c024c9ab>] EFLAGS: 00210287 CPU: 0
EIP is at rpc_create+0x332/0x42f
EAX: 00000000 EBX: df26adf0 ECX: c0251887 EDX: 00000001
ESI: df26ae58 EDI: c02f293c EBP: dda0fc9c ESP: dda0fc2c
 DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
Process uts-oops (pid: 1857, ti=dda0e000 task=dd9a0778 task.ti=dda0e000)
Stack: c0104532 dda0fffc dda0fcac dda0e000 dda0e000 dd93b7f0 00000009 c02f2880
       df26aefc dda0fc68 c01096b7 00000000 c0266ee0 c039a070 c039a070 dda0fc74
       c012ca67 c039a064 dda0fc8c c012cb20 c03daf74 00000011 00000000 c0275c90
Call Trace:
 [<c0104532>] ? dump_trace+0xc2/0xe2
 [<c01096b7>] ? save_stack_trace+0x1c/0x3a
 [<c012ca67>] ? save_trace+0x37/0x8c
 [<c012cb20>] ? add_lock_to_list+0x64/0x96
 [<c0256fc4>] ? rpcb_register_call+0x62/0xbb
 [<c02570c8>] ? rpcb_register+0xab/0xb3
 [<c0252f4d>] ? svc_register+0xb4/0x128
 [<c0253114>] ? svc_destroy+0xec/0x103
 [<c02531b2>] ? svc_exit_thread+0x87/0x8d
 [<c01a75cd>] ? lockd_down+0x61/0x81
 [<c01a577b>] ? nlmclnt_done+0xd/0xf
 [<c01941fe>] ? nfs_destroy_server+0x14/0x16
 [<c0194328>] ? nfs_free_server+0x4c/0xaa
 [<c019a066>] ? nfs_kill_super+0x23/0x27
 [<c0158585>] ? deactivate_super+0x3f/0x51
 [<c01695d1>] ? mntput_no_expire+0x95/0xb4
 [<c016965b>] ? release_mounts+0x6b/0x7a
 [<c01696cc>] ? __put_mnt_ns+0x62/0x70
 [<c0127501>] ? free_nsproxy+0x25/0x80
 [<c012759a>] ? switch_task_namespaces+0x3e/0x43
 [<c01275a9>] ? exit_task_namespaces+0xa/0xc
 [<c0117fed>] ? do_exit+0x4fd/0x666
 [<c01181b3>] ? do_group_exit+0x5d/0x83
 [<c011fa8c>] ? get_signal_to_deliver+0x2c8/0x2e0
 [<c0102630>] ? do_notify_resume+0x69/0x700
 [<c011d85a>] ? do_sigaction+0x134/0x145
 [<c0127205>] ? hrtimer_nanosleep+0x8f/0xce
 [<c0126d1a>] ? hrtimer_wakeup+0x0/0x1c
 [<c0103488>] ? work_notifysig+0x13/0x1b
 =======================
Code: 70 20 68 cb c1 2c c0 e8 75 4e 01 00 8b 83 ac 00 00 00 59 3d 00 f0 ff ff 5f 77 63 eb 57 a1 00 80 2d c0 8b 80 a8 02 00 00 8d 73 68 <8b> 40 04 83 c0 45 e8 41 46 f7 ff ba 20 00 00 00 83 f8 21 0f 4c
EIP: [<c024c9ab>] rpc_create+0x332/0x42f SS:ESP 0068:dda0fc2c

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Serge E. Hallyn" <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 76739e928d0..a59cdf423c1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 	}
 
 	/* save the nodename */
-	clnt->cl_nodelen = strlen(utsname()->nodename);
+	clnt->cl_nodelen = strlen(init_utsname()->nodename);
 	if (clnt->cl_nodelen > UNX_MAXNODENAME)
 		clnt->cl_nodelen = UNX_MAXNODENAME;
-	memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen);
+	memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen);
 	rpc_register_client(clnt);
 	return clnt;
 
-- 
cgit v1.2.3


From b8bae41ed6a53cce56c50811a91cd963e3187d1c Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Tue, 7 Oct 2008 15:34:37 -0700
Subject: ipv4: add mc_count to in_device.

This patch add mc_count to struct in_device and updates
increment/decrement/initilaize of this field in IPv4 and in IPv6.

- Also printing the vfs /proc entry (/proc/net/igmp) is adjusted to
use the new mc_count.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index f70fac61259..7f9e337e390 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1234,6 +1234,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	write_lock_bh(&in_dev->mc_list_lock);
 	im->next=in_dev->mc_list;
 	in_dev->mc_list=im;
+	in_dev->mc_count++;
 	write_unlock_bh(&in_dev->mc_list_lock);
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im->multiaddr);
@@ -1282,6 +1283,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
 			if (--i->users == 0) {
 				write_lock_bh(&in_dev->mc_list_lock);
 				*ip = i->next;
+				in_dev->mc_count--;
 				write_unlock_bh(&in_dev->mc_list_lock);
 				igmp_group_dropped(i);
 
@@ -1330,6 +1332,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
 	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
 			(unsigned long)in_dev);
 	in_dev->mr_ifc_count = 0;
+	in_dev->mc_count     = 0;
 	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
 			(unsigned long)in_dev);
 	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
@@ -1369,8 +1372,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	write_lock_bh(&in_dev->mc_list_lock);
 	while ((i = in_dev->mc_list) != NULL) {
 		in_dev->mc_list = i->next;
+		in_dev->mc_count--;
 		write_unlock_bh(&in_dev->mc_list_lock);
-
 		igmp_group_dropped(i);
 		ip_ma_put(i);
 
@@ -2383,7 +2386,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 
 		if (state->in_dev->mc_list == im) {
 			seq_printf(seq, "%d\t%-10s: %5d %7s\n",
-				   state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+				   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 		}
 
 		seq_printf(seq,
-- 
cgit v1.2.3


From 76108cea065cda58366d16a7eb6ca90d717a1396 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:00 +0200
Subject: netfilter: Use unsigned types for hooknum and pf vars

and (try to) consistently use u_int8_t for the L3 family.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/br_netfilter.c                      |  4 +--
 net/bridge/netfilter/ebt_log.c                 |  2 +-
 net/bridge/netfilter/ebt_ulog.c                |  2 +-
 net/ipv4/netfilter/ipt_LOG.c                   |  2 +-
 net/ipv4/netfilter/ipt_ULOG.c                  |  2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  4 +--
 net/ipv6/netfilter/ip6t_LOG.c                  |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  4 +--
 net/netfilter/core.c                           |  4 +--
 net/netfilter/nf_conntrack_core.c              |  6 ++--
 net/netfilter/nf_conntrack_expect.c            |  2 +-
 net/netfilter/nf_conntrack_h323_main.c         |  3 +-
 net/netfilter/nf_conntrack_proto_dccp.c        |  4 +--
 net/netfilter/nf_conntrack_proto_generic.c     |  2 +-
 net/netfilter/nf_conntrack_proto_gre.c         |  2 +-
 net/netfilter/nf_conntrack_proto_sctp.c        |  2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         |  6 ++--
 net/netfilter/nf_conntrack_proto_udp.c         |  4 +--
 net/netfilter/nf_conntrack_proto_udplite.c     |  4 +--
 net/netfilter/nf_internals.h                   |  4 +--
 net/netfilter/nf_log.c                         |  6 ++--
 net/netfilter/nf_queue.c                       | 10 +++---
 net/netfilter/nf_sockopt.c                     | 15 ++++----
 net/netfilter/nfnetlink_log.c                  |  4 +--
 net/netfilter/x_tables.c                       | 47 ++++++++++++++------------
 net/netfilter/xt_connlimit.c                   |  2 +-
 net/netfilter/xt_conntrack.c                   |  8 ++---
 net/netfilter/xt_hashlimit.c                   | 11 +++---
 28 files changed, 86 insertions(+), 82 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 6a9a6cd74b1..a4abed5b4c4 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -657,7 +657,7 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
 {
 	struct nf_bridge_info *nf_bridge;
 	struct net_device *parent;
-	int pf;
+	u_int8_t pf;
 
 	if (!skb->nf_bridge)
 		return NF_ACCEPT;
@@ -791,7 +791,7 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct net_device *realoutdev = bridge_parent(skb->dev);
-	int pf;
+	u_int8_t pf;
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	/* Be very paranoid. This probably won't happen anymore, but let's
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 2f430d4ae91..3770cd8a7b3 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -84,7 +84,7 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
 
 #define myNIPQUAD(a) a[0], a[1], a[2], a[3]
 static void
-ebt_log_packet(unsigned int pf, unsigned int hooknum,
+ebt_log_packet(u_int8_t pf, unsigned int hooknum,
    const struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out, const struct nf_loginfo *loginfo,
    const char *prefix)
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 2d4c9ef909f..c84bda61afb 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -223,7 +223,7 @@ alloc_failure:
 }
 
 /* this function is registered with the netfilter core */
-static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
+static void ebt_log_packet(u_int8_t pf, unsigned int hooknum,
    const struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out, const struct nf_loginfo *li,
    const char *prefix)
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 0af14137137..9330ba3577e 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -375,7 +375,7 @@ static struct nf_loginfo default_loginfo = {
 };
 
 static void
-ipt_log_packet(unsigned int pf,
+ipt_log_packet(u_int8_t pf,
 	       unsigned int hooknum,
 	       const struct sk_buff *skb,
 	       const struct net_device *in,
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b192756c6d0..d8241e6b077 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -292,7 +292,7 @@ ulog_tg(struct sk_buff *skb, const struct net_device *in,
 	return XT_CONTINUE;
 }
 
-static void ipt_logfn(unsigned int pf,
+static void ipt_logfn(u_int8_t pf,
 		      unsigned int hooknum,
 		      const struct sk_buff *skb,
 		      const struct net_device *in,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 97791048fa9..da8edcdaef3 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -79,7 +79,7 @@ static int icmp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
 		       enum ip_conntrack_info ctinfo,
-		       int pf,
+		       u_int8_t pf,
 		       unsigned int hooknum)
 {
 	/* Try to delete connection immediately after all replies:
@@ -173,7 +173,7 @@ icmp_error_message(struct sk_buff *skb,
 /* Small and modified version of icmp_rcv */
 static int
 icmp_error(struct sk_buff *skb, unsigned int dataoff,
-	   enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
+	   enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
 {
 	const struct icmphdr *icmph;
 	struct icmphdr _ih;
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 3a2316974f8..0716f8afa0b 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -385,7 +385,7 @@ static struct nf_loginfo default_loginfo = {
 };
 
 static void
-ip6t_log_packet(unsigned int pf,
+ip6t_log_packet(u_int8_t pf,
 		unsigned int hooknum,
 		const struct sk_buff *skb,
 		const struct net_device *in,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 14d47d83354..5756f30ebc6 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -81,7 +81,7 @@ static int icmpv6_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
 		       enum ip_conntrack_info ctinfo,
-		       int pf,
+		       u_int8_t pf,
 		       unsigned int hooknum)
 {
 	/* Try to delete connection immediately after all replies:
@@ -173,7 +173,7 @@ icmpv6_error_message(struct sk_buff *skb,
 
 static int
 icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
-	     enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
+	     enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
 {
 	const struct icmp6hdr *icmp6h;
 	struct icmp6hdr _ih;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 292fa28146f..26b8f489d7a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL(nf_unregister_hooks);
 
 unsigned int nf_iterate(struct list_head *head,
 			struct sk_buff *skb,
-			int hook,
+			unsigned int hook,
 			const struct net_device *indev,
 			const struct net_device *outdev,
 			struct list_head **i,
@@ -155,7 +155,7 @@ unsigned int nf_iterate(struct list_head *head,
 
 /* Returns 1 if okfn() needs to be executed by the caller,
  * -EPERM for NF_DROP, 0 otherwise. */
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
+int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
 		 struct net_device *indev,
 		 struct net_device *outdev,
 		 int (*okfn)(struct sk_buff *),
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9d1830da8e8..6aaf64b5ded 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -665,7 +665,7 @@ resolve_normal_ct(struct sk_buff *skb,
 }
 
 unsigned int
-nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb)
+nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -683,7 +683,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb)
 	}
 
 	/* rcu_read_lock()ed by nf_hook_slow */
-	l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
+	l3proto = __nf_ct_l3proto_find(pf);
 	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
 				   &dataoff, &protonum);
 	if (ret <= 0) {
@@ -693,7 +693,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb)
 		return -ret;
 	}
 
-	l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum);
+	l4proto = __nf_ct_l4proto_find(pf, protonum);
 
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index e8f0dead267..990fa12f2ee 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -241,7 +241,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
 EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
 
 void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
-		       int family,
+		       u_int8_t family,
 		       const union nf_inet_addr *saddr,
 		       const union nf_inet_addr *daddr,
 		       u_int8_t proto, const __be16 *src, const __be16 *dst)
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 2f83c158934..5dc0478108a 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -709,7 +709,8 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
 /* If the calling party is on the same side of the forward-to party,
  * we don't need to track the second call */
 static int callforward_do_filter(const union nf_inet_addr *src,
-                                 const union nf_inet_addr *dst, int family)
+				 const union nf_inet_addr *dst,
+				 u_int8_t family)
 {
 	const struct nf_afinfo *afinfo;
 	struct flowi fl1, fl2;
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index e7866dd3cde..edc30358dc1 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -461,7 +461,7 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
 
 static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
-		       int pf, unsigned int hooknum)
+		       u_int8_t pf, unsigned int hooknum)
 {
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	struct dccp_hdr _dh, *dh;
@@ -546,7 +546,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 }
 
 static int dccp_error(struct sk_buff *skb, unsigned int dataoff,
-		      enum ip_conntrack_info *ctinfo, int pf,
+		      enum ip_conntrack_info *ctinfo, u_int8_t pf,
 		      unsigned int hooknum)
 {
 	struct dccp_hdr _dh, *dh;
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index e31b0e7bd0b..dbe680af85d 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -45,7 +45,7 @@ static int packet(struct nf_conn *ct,
 		  const struct sk_buff *skb,
 		  unsigned int dataoff,
 		  enum ip_conntrack_info ctinfo,
-		  int pf,
+		  u_int8_t pf,
 		  unsigned int hooknum)
 {
 	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 9bd03967fea..c5a78220fa3 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -219,7 +219,7 @@ static int gre_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
-		      int pf,
+		      u_int8_t pf,
 		      unsigned int hooknum)
 {
 	/* If we've seen traffic both ways, this is a GRE connection.
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 30aa5b94a77..b5a90596d3f 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -287,7 +287,7 @@ static int sctp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
 		       enum ip_conntrack_info ctinfo,
-		       int pf,
+		       u_int8_t pf,
 		       unsigned int hooknum)
 {
 	enum sctp_conntrack new_state, old_state;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 6f61261888e..539a8202025 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -486,7 +486,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			  const struct sk_buff *skb,
 			  unsigned int dataoff,
 			  const struct tcphdr *tcph,
-			  int pf)
+			  u_int8_t pf)
 {
 	struct ip_ct_tcp_state *sender = &state->seen[dir];
 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
@@ -749,7 +749,7 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
 static int tcp_error(struct sk_buff *skb,
 		     unsigned int dataoff,
 		     enum ip_conntrack_info *ctinfo,
-		     int pf,
+		     u_int8_t pf,
 		     unsigned int hooknum)
 {
 	const struct tcphdr *th;
@@ -804,7 +804,7 @@ static int tcp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
-		      int pf,
+		      u_int8_t pf,
 		      unsigned int hooknum)
 {
 	struct nf_conntrack_tuple *tuple;
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 8b21762e65d..2a965c4a0ea 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -66,7 +66,7 @@ static int udp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
-		      int pf,
+		      u_int8_t pf,
 		      unsigned int hooknum)
 {
 	/* If we've seen traffic both ways, this is some kind of UDP
@@ -91,7 +91,7 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 static int udp_error(struct sk_buff *skb, unsigned int dataoff,
 		     enum ip_conntrack_info *ctinfo,
-		     int pf,
+		     u_int8_t pf,
 		     unsigned int hooknum)
 {
 	unsigned int udplen = skb->len - dataoff;
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 1fa62f3c24f..4fb6c8d83a8 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -65,7 +65,7 @@ static int udplite_packet(struct nf_conn *ct,
 			  const struct sk_buff *skb,
 			  unsigned int dataoff,
 			  enum ip_conntrack_info ctinfo,
-			  int pf,
+			  u_int8_t pf,
 			  unsigned int hooknum)
 {
 	/* If we've seen traffic both ways, this is some kind of UDP
@@ -91,7 +91,7 @@ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 static int udplite_error(struct sk_buff *skb, unsigned int dataoff,
 			 enum ip_conntrack_info *ctinfo,
-			 int pf,
+			 u_int8_t pf,
 			 unsigned int hooknum)
 {
 	unsigned int udplen = skb->len - dataoff;
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 196269c1e58..bf6609978af 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -15,7 +15,7 @@
 /* core.c */
 extern unsigned int nf_iterate(struct list_head *head,
 				struct sk_buff *skb,
-				int hook,
+				unsigned int hook,
 				const struct net_device *indev,
 				const struct net_device *outdev,
 				struct list_head **i,
@@ -25,7 +25,7 @@ extern unsigned int nf_iterate(struct list_head *head,
 /* nf_queue.c */
 extern int nf_queue(struct sk_buff *skb,
 		    struct list_head *elem,
-		    int pf, unsigned int hook,
+		    u_int8_t pf, unsigned int hook,
 		    struct net_device *indev,
 		    struct net_device *outdev,
 		    int (*okfn)(struct sk_buff *),
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 9fda6ee95a3..5c2f7332015 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -20,7 +20,7 @@ static DEFINE_MUTEX(nf_log_mutex);
 
 /* return EBUSY if somebody else is registered, EEXIST if the same logger
  * is registred, 0 on success. */
-int nf_log_register(int pf, const struct nf_logger *logger)
+int nf_log_register(u_int8_t pf, const struct nf_logger *logger)
 {
 	int ret;
 
@@ -45,7 +45,7 @@ int nf_log_register(int pf, const struct nf_logger *logger)
 }
 EXPORT_SYMBOL(nf_log_register);
 
-void nf_log_unregister_pf(int pf)
+void nf_log_unregister_pf(u_int8_t pf)
 {
 	if (pf >= NPROTO)
 		return;
@@ -73,7 +73,7 @@ void nf_log_unregister(const struct nf_logger *logger)
 }
 EXPORT_SYMBOL(nf_log_unregister);
 
-void nf_log_packet(int pf,
+void nf_log_packet(u_int8_t pf,
 		   unsigned int hooknum,
 		   const struct sk_buff *skb,
 		   const struct net_device *in,
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 582ec3efc8a..f285086f629 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -22,7 +22,7 @@ static DEFINE_MUTEX(queue_handler_mutex);
 
 /* return EBUSY when somebody else is registered, return EEXIST if the
  * same handler is registered, return 0 in case of success. */
-int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh)
+int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
 {
 	int ret;
 
@@ -45,7 +45,7 @@ int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh)
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-int nf_unregister_queue_handler(int pf, const struct nf_queue_handler *qh)
+int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
 {
 	if (pf >= NPROTO)
 		return -EINVAL;
@@ -67,7 +67,7 @@ EXPORT_SYMBOL(nf_unregister_queue_handler);
 
 void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
 {
-	int pf;
+	u_int8_t pf;
 
 	mutex_lock(&queue_handler_mutex);
 	for (pf = 0; pf < NPROTO; pf++)  {
@@ -107,7 +107,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
  */
 static int __nf_queue(struct sk_buff *skb,
 		      struct list_head *elem,
-		      int pf, unsigned int hook,
+		      u_int8_t pf, unsigned int hook,
 		      struct net_device *indev,
 		      struct net_device *outdev,
 		      int (*okfn)(struct sk_buff *),
@@ -191,7 +191,7 @@ err:
 
 int nf_queue(struct sk_buff *skb,
 	     struct list_head *elem,
-	     int pf, unsigned int hook,
+	     u_int8_t pf, unsigned int hook,
 	     struct net_device *indev,
 	     struct net_device *outdev,
 	     int (*okfn)(struct sk_buff *),
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 01489681fa9..f9b46de6a3d 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -60,7 +60,7 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
 }
 EXPORT_SYMBOL(nf_unregister_sockopt);
 
-static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf,
+static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
 		int val, int get)
 {
 	struct nf_sockopt_ops *ops;
@@ -96,7 +96,7 @@ out:
 }
 
 /* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, int pf, int val,
+static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
 		      char __user *opt, int *len, int get)
 {
 	struct nf_sockopt_ops *ops;
@@ -115,21 +115,22 @@ static int nf_sockopt(struct sock *sk, int pf, int val,
 	return ret;
 }
 
-int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
 		  int len)
 {
 	return nf_sockopt(sk, pf, val, opt, &len, 0);
 }
 EXPORT_SYMBOL(nf_setsockopt);
 
-int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
+int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+		  int *len)
 {
 	return nf_sockopt(sk, pf, val, opt, len, 1);
 }
 EXPORT_SYMBOL(nf_getsockopt);
 
 #ifdef CONFIG_COMPAT
-static int compat_nf_sockopt(struct sock *sk, int pf, int val,
+static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
 			     char __user *opt, int *len, int get)
 {
 	struct nf_sockopt_ops *ops;
@@ -155,14 +156,14 @@ static int compat_nf_sockopt(struct sock *sk, int pf, int val,
 	return ret;
 }
 
-int compat_nf_setsockopt(struct sock *sk, int pf,
+int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
 		int val, char __user *opt, int len)
 {
 	return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
 }
 EXPORT_SYMBOL(compat_nf_setsockopt);
 
-int compat_nf_getsockopt(struct sock *sk, int pf,
+int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
 		int val, char __user *opt, int *len)
 {
 	return compat_nf_sockopt(sk, pf, val, opt, len, 1);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 9a35b57ab76..41e0105d382 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -359,7 +359,7 @@ static inline int
 __build_packet_message(struct nfulnl_instance *inst,
 			const struct sk_buff *skb,
 			unsigned int data_len,
-			unsigned int pf,
+			u_int8_t pf,
 			unsigned int hooknum,
 			const struct net_device *indev,
 			const struct net_device *outdev,
@@ -534,7 +534,7 @@ static struct nf_loginfo default_loginfo = {
 
 /* log handler for internal netfilter logging api */
 static void
-nfulnl_log_packet(unsigned int pf,
+nfulnl_log_packet(u_int8_t pf,
 		  unsigned int hooknum,
 		  const struct sk_buff *skb,
 		  const struct net_device *in,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 5d75cd86ebb..cf2f3e90cef 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -68,7 +68,8 @@ static const char *const xt_prefix[NPROTO] = {
 int
 xt_register_target(struct xt_target *target)
 {
-	int ret, af = target->family;
+	u_int8_t af = target->family;
+	int ret;
 
 	ret = mutex_lock_interruptible(&xt[af].mutex);
 	if (ret != 0)
@@ -82,7 +83,7 @@ EXPORT_SYMBOL(xt_register_target);
 void
 xt_unregister_target(struct xt_target *target)
 {
-	int af = target->family;
+	u_int8_t af = target->family;
 
 	mutex_lock(&xt[af].mutex);
 	list_del(&target->list);
@@ -123,7 +124,8 @@ EXPORT_SYMBOL(xt_unregister_targets);
 int
 xt_register_match(struct xt_match *match)
 {
-	int ret, af = match->family;
+	u_int8_t af = match->family;
+	int ret;
 
 	ret = mutex_lock_interruptible(&xt[af].mutex);
 	if (ret != 0)
@@ -139,7 +141,7 @@ EXPORT_SYMBOL(xt_register_match);
 void
 xt_unregister_match(struct xt_match *match)
 {
-	int af =  match->family;
+	u_int8_t af = match->family;
 
 	mutex_lock(&xt[af].mutex);
 	list_del(&match->list);
@@ -185,7 +187,7 @@ EXPORT_SYMBOL(xt_unregister_matches);
  */
 
 /* Find match, grabs ref.  Returns ERR_PTR() on error. */
-struct xt_match *xt_find_match(int af, const char *name, u8 revision)
+struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
 {
 	struct xt_match *m;
 	int err = 0;
@@ -210,7 +212,7 @@ struct xt_match *xt_find_match(int af, const char *name, u8 revision)
 EXPORT_SYMBOL(xt_find_match);
 
 /* Find target, grabs ref.  Returns ERR_PTR() on error. */
-struct xt_target *xt_find_target(int af, const char *name, u8 revision)
+struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
 {
 	struct xt_target *t;
 	int err = 0;
@@ -234,7 +236,7 @@ struct xt_target *xt_find_target(int af, const char *name, u8 revision)
 }
 EXPORT_SYMBOL(xt_find_target);
 
-struct xt_target *xt_request_find_target(int af, const char *name, u8 revision)
+struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
 {
 	struct xt_target *target;
 
@@ -246,7 +248,7 @@ struct xt_target *xt_request_find_target(int af, const char *name, u8 revision)
 }
 EXPORT_SYMBOL_GPL(xt_request_find_target);
 
-static int match_revfn(int af, const char *name, u8 revision, int *bestp)
+static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
 {
 	const struct xt_match *m;
 	int have_rev = 0;
@@ -262,7 +264,7 @@ static int match_revfn(int af, const char *name, u8 revision, int *bestp)
 	return have_rev;
 }
 
-static int target_revfn(int af, const char *name, u8 revision, int *bestp)
+static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
 {
 	const struct xt_target *t;
 	int have_rev = 0;
@@ -279,7 +281,7 @@ static int target_revfn(int af, const char *name, u8 revision, int *bestp)
 }
 
 /* Returns true or false (if no such extension at all) */
-int xt_find_revision(int af, const char *name, u8 revision, int target,
+int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 		     int *err)
 {
 	int have_rev, best = -1;
@@ -337,7 +339,7 @@ int xt_check_match(const struct xt_match *match, unsigned short family,
 EXPORT_SYMBOL_GPL(xt_check_match);
 
 #ifdef CONFIG_COMPAT
-int xt_compat_add_offset(int af, unsigned int offset, short delta)
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta)
 {
 	struct compat_delta *tmp;
 
@@ -359,7 +361,7 @@ int xt_compat_add_offset(int af, unsigned int offset, short delta)
 }
 EXPORT_SYMBOL_GPL(xt_compat_add_offset);
 
-void xt_compat_flush_offsets(int af)
+void xt_compat_flush_offsets(u_int8_t af)
 {
 	struct compat_delta *tmp, *next;
 
@@ -373,7 +375,7 @@ void xt_compat_flush_offsets(int af)
 }
 EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
 
-short xt_compat_calc_jump(int af, unsigned int offset)
+short xt_compat_calc_jump(u_int8_t af, unsigned int offset)
 {
 	struct compat_delta *tmp;
 	short delta;
@@ -590,7 +592,8 @@ void xt_free_table_info(struct xt_table_info *info)
 EXPORT_SYMBOL(xt_free_table_info);
 
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
-struct xt_table *xt_find_table_lock(struct net *net, int af, const char *name)
+struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
+				    const char *name)
 {
 	struct xt_table *t;
 
@@ -612,13 +615,13 @@ void xt_table_unlock(struct xt_table *table)
 EXPORT_SYMBOL_GPL(xt_table_unlock);
 
 #ifdef CONFIG_COMPAT
-void xt_compat_lock(int af)
+void xt_compat_lock(u_int8_t af)
 {
 	mutex_lock(&xt[af].compat_mutex);
 }
 EXPORT_SYMBOL_GPL(xt_compat_lock);
 
-void xt_compat_unlock(int af)
+void xt_compat_unlock(u_int8_t af)
 {
 	mutex_unlock(&xt[af].compat_mutex);
 }
@@ -722,13 +725,13 @@ EXPORT_SYMBOL_GPL(xt_unregister_table);
 #ifdef CONFIG_PROC_FS
 struct xt_names_priv {
 	struct seq_net_private p;
-	int af;
+	u_int8_t af;
 };
 static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct xt_names_priv *priv = seq->private;
 	struct net *net = seq_file_net(seq);
-	int af = priv->af;
+	u_int8_t af = priv->af;
 
 	mutex_lock(&xt[af].mutex);
 	return seq_list_start(&net->xt.tables[af], *pos);
@@ -738,7 +741,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct xt_names_priv *priv = seq->private;
 	struct net *net = seq_file_net(seq);
-	int af = priv->af;
+	u_int8_t af = priv->af;
 
 	return seq_list_next(v, &net->xt.tables[af], pos);
 }
@@ -746,7 +749,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static void xt_table_seq_stop(struct seq_file *seq, void *v)
 {
 	struct xt_names_priv *priv = seq->private;
-	int af = priv->af;
+	u_int8_t af = priv->af;
 
 	mutex_unlock(&xt[af].mutex);
 }
@@ -922,7 +925,7 @@ static const struct file_operations xt_target_ops = {
 
 #endif /* CONFIG_PROC_FS */
 
-int xt_proto_init(struct net *net, int af)
+int xt_proto_init(struct net *net, u_int8_t af)
 {
 #ifdef CONFIG_PROC_FS
 	char buf[XT_FUNCTION_MAXNAMELEN];
@@ -974,7 +977,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(xt_proto_init);
 
-void xt_proto_fini(struct net *net, int af)
+void xt_proto_fini(struct net *net, u_int8_t af)
 {
 #ifdef CONFIG_PROC_FS
 	char buf[XT_FUNCTION_MAXNAMELEN];
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 70907f6baac..1655e2cf25c 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -82,7 +82,7 @@ static inline bool already_closed(const struct nf_conn *conn)
 static inline unsigned int
 same_source_net(const union nf_inet_addr *addr,
 		const union nf_inet_addr *mask,
-		const union nf_inet_addr *u3, unsigned int family)
+		const union nf_inet_addr *u3, u_int8_t family)
 {
 	if (family == AF_INET) {
 		return (addr->ip & mask->ip) == (u3->ip & mask->ip);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index d61412f58ef..28a42a3fbff 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -133,7 +133,7 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr,
 static inline bool
 conntrack_mt_origsrc(const struct nf_conn *ct,
                      const struct xt_conntrack_mtinfo1 *info,
-                     unsigned int family)
+		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
 	       &info->origsrc_addr, &info->origsrc_mask, family);
@@ -142,7 +142,7 @@ conntrack_mt_origsrc(const struct nf_conn *ct,
 static inline bool
 conntrack_mt_origdst(const struct nf_conn *ct,
                      const struct xt_conntrack_mtinfo1 *info,
-                     unsigned int family)
+		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3,
 	       &info->origdst_addr, &info->origdst_mask, family);
@@ -151,7 +151,7 @@ conntrack_mt_origdst(const struct nf_conn *ct,
 static inline bool
 conntrack_mt_replsrc(const struct nf_conn *ct,
                      const struct xt_conntrack_mtinfo1 *info,
-                     unsigned int family)
+		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
 	       &info->replsrc_addr, &info->replsrc_mask, family);
@@ -160,7 +160,7 @@ conntrack_mt_replsrc(const struct nf_conn *ct,
 static inline bool
 conntrack_mt_repldst(const struct nf_conn *ct,
                      const struct xt_conntrack_mtinfo1 *info,
-                     unsigned int family)
+		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3,
 	       &info->repldst_addr, &info->repldst_mask, family);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index d9418a26781..0c9268fd2e1 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -80,7 +80,7 @@ struct dsthash_ent {
 struct xt_hashlimit_htable {
 	struct hlist_node node;		/* global list of all htables */
 	atomic_t use;
-	int family;
+	u_int8_t family;
 
 	struct hashlimit_cfg1 cfg;	/* config */
 
@@ -185,7 +185,7 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(unsigned long htlong);
 
-static int htable_create_v0(struct xt_hashlimit_info *minfo, int family)
+static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family)
 {
 	struct xt_hashlimit_htable *hinfo;
 	unsigned int size;
@@ -258,8 +258,7 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, int family)
 	return 0;
 }
 
-static int htable_create(struct xt_hashlimit_mtinfo1 *minfo,
-                         unsigned int family)
+static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family)
 {
 	struct xt_hashlimit_htable *hinfo;
 	unsigned int size;
@@ -378,7 +377,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo)
 }
 
 static struct xt_hashlimit_htable *htable_find_get(const char *name,
-						   int family)
+						   u_int8_t family)
 {
 	struct xt_hashlimit_htable *hinfo;
 	struct hlist_node *pos;
@@ -901,7 +900,7 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 	spin_unlock_bh(&htable->lock);
 }
 
-static int dl_seq_real_show(struct dsthash_ent *ent, int family,
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				   struct seq_file *s)
 {
 	/* recalculate to show accurate numbers */
-- 
cgit v1.2.3


From e948b20a71a06a740c925d6ea22b59b4e17cfa0c Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Wed, 8 Oct 2008 11:35:00 +0200
Subject: netfilter: rename ipt_recent to xt_recent

Like with other modules (such as ipt_state), ipt_recent.h is changed
to forward definitions to (IOW include) xt_recent.h, and xt_recent.c
is changed to use the new constant names.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig      |  13 --
 net/ipv4/netfilter/Makefile     |   1 -
 net/ipv4/netfilter/ipt_recent.c | 501 ---------------------------------------
 net/netfilter/Kconfig           |  11 +
 net/netfilter/Makefile          |   1 +
 net/netfilter/xt_recent.c       | 502 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 514 insertions(+), 515 deletions(-)
 delete mode 100644 net/ipv4/netfilter/ipt_recent.c
 create mode 100644 net/netfilter/xt_recent.c

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 90eb7cb47e7..4e842d56642 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -57,19 +57,6 @@ config IP_NF_IPTABLES
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The matches.
-config IP_NF_MATCH_RECENT
-	tristate '"recent" match support'
-	depends on IP_NF_IPTABLES
-	depends on NETFILTER_ADVANCED
-	help
-	  This match is used for creating one or many lists of recently
-	  used addresses and then matching against that/those list(s).
-
-	  Short options are available by using 'iptables -m recent -h'
-	  Official Website: <http://snowman.net/projects/ipt_recent/>
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_ECN
 	tristate '"ecn" match support'
 	depends on IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3f31291f37c..1107edbe478 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
 obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
-obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
 obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
 
 # targets
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
deleted file mode 100644
index 3974d7cae5c..00000000000
--- a/net/ipv4/netfilter/ipt_recent.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This is a replacement of the old ipt_recent module, which carried the
- * following copyright notice:
- *
- * Author: Stephen Frost <sfrost@snowman.net>
- * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
- */
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/moduleparam.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/list.h>
-#include <linux/random.h>
-#include <linux/jhash.h>
-#include <linux/bitops.h>
-#include <linux/skbuff.h>
-#include <linux/inet.h>
-#include <net/net_namespace.h>
-
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_recent.h>
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4");
-MODULE_LICENSE("GPL");
-
-static unsigned int ip_list_tot = 100;
-static unsigned int ip_pkt_list_tot = 20;
-static unsigned int ip_list_hash_size = 0;
-static unsigned int ip_list_perms = 0644;
-static unsigned int ip_list_uid = 0;
-static unsigned int ip_list_gid = 0;
-module_param(ip_list_tot, uint, 0400);
-module_param(ip_pkt_list_tot, uint, 0400);
-module_param(ip_list_hash_size, uint, 0400);
-module_param(ip_list_perms, uint, 0400);
-module_param(ip_list_uid, uint, 0400);
-module_param(ip_list_gid, uint, 0400);
-MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
-MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
-MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
-MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files");
-
-struct recent_entry {
-	struct list_head	list;
-	struct list_head	lru_list;
-	__be32			addr;
-	u_int8_t		ttl;
-	u_int8_t		index;
-	u_int16_t		nstamps;
-	unsigned long		stamps[0];
-};
-
-struct recent_table {
-	struct list_head	list;
-	char			name[IPT_RECENT_NAME_LEN];
-#ifdef CONFIG_PROC_FS
-	struct proc_dir_entry	*proc;
-#endif
-	unsigned int		refcnt;
-	unsigned int		entries;
-	struct list_head	lru_list;
-	struct list_head	iphash[0];
-};
-
-static LIST_HEAD(tables);
-static DEFINE_SPINLOCK(recent_lock);
-static DEFINE_MUTEX(recent_mutex);
-
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry	*proc_dir;
-static const struct file_operations	recent_fops;
-#endif
-
-static u_int32_t hash_rnd;
-static int hash_rnd_initted;
-
-static unsigned int recent_entry_hash(__be32 addr)
-{
-	if (!hash_rnd_initted) {
-		get_random_bytes(&hash_rnd, 4);
-		hash_rnd_initted = 1;
-	}
-	return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1);
-}
-
-static struct recent_entry *
-recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl)
-{
-	struct recent_entry *e;
-	unsigned int h;
-
-	h = recent_entry_hash(addr);
-	list_for_each_entry(e, &table->iphash[h], list)
-		if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
-			return e;
-	return NULL;
-}
-
-static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
-{
-	list_del(&e->list);
-	list_del(&e->lru_list);
-	kfree(e);
-	t->entries--;
-}
-
-static struct recent_entry *
-recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl)
-{
-	struct recent_entry *e;
-
-	if (t->entries >= ip_list_tot) {
-		e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
-		recent_entry_remove(t, e);
-	}
-	e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
-		    GFP_ATOMIC);
-	if (e == NULL)
-		return NULL;
-	e->addr      = addr;
-	e->ttl       = ttl;
-	e->stamps[0] = jiffies;
-	e->nstamps   = 1;
-	e->index     = 1;
-	list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
-	list_add_tail(&e->lru_list, &t->lru_list);
-	t->entries++;
-	return e;
-}
-
-static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
-{
-	e->stamps[e->index++] = jiffies;
-	if (e->index > e->nstamps)
-		e->nstamps = e->index;
-	e->index %= ip_pkt_list_tot;
-	list_move_tail(&e->lru_list, &t->lru_list);
-}
-
-static struct recent_table *recent_table_lookup(const char *name)
-{
-	struct recent_table *t;
-
-	list_for_each_entry(t, &tables, list)
-		if (!strcmp(t->name, name))
-			return t;
-	return NULL;
-}
-
-static void recent_table_flush(struct recent_table *t)
-{
-	struct recent_entry *e, *next;
-	unsigned int i;
-
-	for (i = 0; i < ip_list_hash_size; i++)
-		list_for_each_entry_safe(e, next, &t->iphash[i], list)
-			recent_entry_remove(t, e);
-}
-
-static bool
-recent_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
-{
-	const struct ipt_recent_info *info = matchinfo;
-	struct recent_table *t;
-	struct recent_entry *e;
-	__be32 addr;
-	u_int8_t ttl;
-	bool ret = info->invert;
-
-	if (info->side == IPT_RECENT_DEST)
-		addr = ip_hdr(skb)->daddr;
-	else
-		addr = ip_hdr(skb)->saddr;
-
-	ttl = ip_hdr(skb)->ttl;
-	/* use TTL as seen before forwarding */
-	if (out && !skb->sk)
-		ttl++;
-
-	spin_lock_bh(&recent_lock);
-	t = recent_table_lookup(info->name);
-	e = recent_entry_lookup(t, addr,
-				info->check_set & IPT_RECENT_TTL ? ttl : 0);
-	if (e == NULL) {
-		if (!(info->check_set & IPT_RECENT_SET))
-			goto out;
-		e = recent_entry_init(t, addr, ttl);
-		if (e == NULL)
-			*hotdrop = true;
-		ret = !ret;
-		goto out;
-	}
-
-	if (info->check_set & IPT_RECENT_SET)
-		ret = !ret;
-	else if (info->check_set & IPT_RECENT_REMOVE) {
-		recent_entry_remove(t, e);
-		ret = !ret;
-	} else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
-		unsigned long time = jiffies - info->seconds * HZ;
-		unsigned int i, hits = 0;
-
-		for (i = 0; i < e->nstamps; i++) {
-			if (info->seconds && time_after(time, e->stamps[i]))
-				continue;
-			if (++hits >= info->hit_count) {
-				ret = !ret;
-				break;
-			}
-		}
-	}
-
-	if (info->check_set & IPT_RECENT_SET ||
-	    (info->check_set & IPT_RECENT_UPDATE && ret)) {
-		recent_entry_update(t, e);
-		e->ttl = ttl;
-	}
-out:
-	spin_unlock_bh(&recent_lock);
-	return ret;
-}
-
-static bool
-recent_mt_check(const char *tablename, const void *ip,
-                const struct xt_match *match, void *matchinfo,
-                unsigned int hook_mask)
-{
-	const struct ipt_recent_info *info = matchinfo;
-	struct recent_table *t;
-	unsigned i;
-	bool ret = false;
-
-	if (hweight8(info->check_set &
-		     (IPT_RECENT_SET | IPT_RECENT_REMOVE |
-		      IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
-		return false;
-	if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
-	    (info->seconds || info->hit_count))
-		return false;
-	if (info->hit_count > ip_pkt_list_tot)
-		return false;
-	if (info->name[0] == '\0' ||
-	    strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
-		return false;
-
-	mutex_lock(&recent_mutex);
-	t = recent_table_lookup(info->name);
-	if (t != NULL) {
-		t->refcnt++;
-		ret = true;
-		goto out;
-	}
-
-	t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
-		    GFP_KERNEL);
-	if (t == NULL)
-		goto out;
-	t->refcnt = 1;
-	strcpy(t->name, info->name);
-	INIT_LIST_HEAD(&t->lru_list);
-	for (i = 0; i < ip_list_hash_size; i++)
-		INIT_LIST_HEAD(&t->iphash[i]);
-#ifdef CONFIG_PROC_FS
-	t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops);
-	if (t->proc == NULL) {
-		kfree(t);
-		goto out;
-	}
-	t->proc->uid       = ip_list_uid;
-	t->proc->gid       = ip_list_gid;
-	t->proc->data      = t;
-#endif
-	spin_lock_bh(&recent_lock);
-	list_add_tail(&t->list, &tables);
-	spin_unlock_bh(&recent_lock);
-	ret = true;
-out:
-	mutex_unlock(&recent_mutex);
-	return ret;
-}
-
-static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
-{
-	const struct ipt_recent_info *info = matchinfo;
-	struct recent_table *t;
-
-	mutex_lock(&recent_mutex);
-	t = recent_table_lookup(info->name);
-	if (--t->refcnt == 0) {
-		spin_lock_bh(&recent_lock);
-		list_del(&t->list);
-		spin_unlock_bh(&recent_lock);
-#ifdef CONFIG_PROC_FS
-		remove_proc_entry(t->name, proc_dir);
-#endif
-		recent_table_flush(t);
-		kfree(t);
-	}
-	mutex_unlock(&recent_mutex);
-}
-
-#ifdef CONFIG_PROC_FS
-struct recent_iter_state {
-	struct recent_table	*table;
-	unsigned int		bucket;
-};
-
-static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(recent_lock)
-{
-	struct recent_iter_state *st = seq->private;
-	const struct recent_table *t = st->table;
-	struct recent_entry *e;
-	loff_t p = *pos;
-
-	spin_lock_bh(&recent_lock);
-
-	for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++)
-		list_for_each_entry(e, &t->iphash[st->bucket], list)
-			if (p-- == 0)
-				return e;
-	return NULL;
-}
-
-static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct recent_iter_state *st = seq->private;
-	const struct recent_table *t = st->table;
-	struct recent_entry *e = v;
-	struct list_head *head = e->list.next;
-
-	while (head == &t->iphash[st->bucket]) {
-		if (++st->bucket >= ip_list_hash_size)
-			return NULL;
-		head = t->iphash[st->bucket].next;
-	}
-	(*pos)++;
-	return list_entry(head, struct recent_entry, list);
-}
-
-static void recent_seq_stop(struct seq_file *s, void *v)
-	__releases(recent_lock)
-{
-	spin_unlock_bh(&recent_lock);
-}
-
-static int recent_seq_show(struct seq_file *seq, void *v)
-{
-	const struct recent_entry *e = v;
-	unsigned int i;
-
-	i = (e->index - 1) % ip_pkt_list_tot;
-	seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
-		   NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
-	for (i = 0; i < e->nstamps; i++)
-		seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
-	seq_printf(seq, "\n");
-	return 0;
-}
-
-static const struct seq_operations recent_seq_ops = {
-	.start		= recent_seq_start,
-	.next		= recent_seq_next,
-	.stop		= recent_seq_stop,
-	.show		= recent_seq_show,
-};
-
-static int recent_seq_open(struct inode *inode, struct file *file)
-{
-	struct proc_dir_entry *pde = PDE(inode);
-	struct recent_iter_state *st;
-
-	st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
-	if (st == NULL)
-		return -ENOMEM;
-
-	st->table    = pde->data;
-	return 0;
-}
-
-static ssize_t recent_proc_write(struct file *file, const char __user *input,
-				 size_t size, loff_t *loff)
-{
-	const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	struct recent_table *t = pde->data;
-	struct recent_entry *e;
-	char buf[sizeof("+255.255.255.255")], *c = buf;
-	__be32 addr;
-	int add;
-
-	if (size > sizeof(buf))
-		size = sizeof(buf);
-	if (copy_from_user(buf, input, size))
-		return -EFAULT;
-	while (isspace(*c))
-		c++;
-
-	if (size - (c - buf) < 5)
-		return c - buf;
-	if (!strncmp(c, "clear", 5)) {
-		c += 5;
-		spin_lock_bh(&recent_lock);
-		recent_table_flush(t);
-		spin_unlock_bh(&recent_lock);
-		return c - buf;
-	}
-
-	switch (*c) {
-	case '-':
-		add = 0;
-		c++;
-		break;
-	case '+':
-		c++;
-	default:
-		add = 1;
-		break;
-	}
-	addr = in_aton(c);
-
-	spin_lock_bh(&recent_lock);
-	e = recent_entry_lookup(t, addr, 0);
-	if (e == NULL) {
-		if (add)
-			recent_entry_init(t, addr, 0);
-	} else {
-		if (add)
-			recent_entry_update(t, e);
-		else
-			recent_entry_remove(t, e);
-	}
-	spin_unlock_bh(&recent_lock);
-	return size;
-}
-
-static const struct file_operations recent_fops = {
-	.open		= recent_seq_open,
-	.read		= seq_read,
-	.write		= recent_proc_write,
-	.release	= seq_release_private,
-	.owner		= THIS_MODULE,
-};
-#endif /* CONFIG_PROC_FS */
-
-static struct xt_match recent_mt_reg __read_mostly = {
-	.name		= "recent",
-	.family		= AF_INET,
-	.match		= recent_mt,
-	.matchsize	= sizeof(struct ipt_recent_info),
-	.checkentry	= recent_mt_check,
-	.destroy	= recent_mt_destroy,
-	.me		= THIS_MODULE,
-};
-
-static int __init recent_mt_init(void)
-{
-	int err;
-
-	if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
-		return -EINVAL;
-	ip_list_hash_size = 1 << fls(ip_list_tot);
-
-	err = xt_register_match(&recent_mt_reg);
-#ifdef CONFIG_PROC_FS
-	if (err)
-		return err;
-	proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
-	if (proc_dir == NULL) {
-		xt_unregister_match(&recent_mt_reg);
-		err = -ENOMEM;
-	}
-#endif
-	return err;
-}
-
-static void __exit recent_mt_exit(void)
-{
-	BUG_ON(!list_empty(&tables));
-	xt_unregister_match(&recent_mt_reg);
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("ipt_recent", init_net.proc_net);
-#endif
-}
-
-module_init(recent_mt_init);
-module_exit(recent_mt_exit);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ee898e74808..ccc78b07a1a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -732,6 +732,17 @@ config NETFILTER_XT_MATCH_REALM
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_RECENT
+	tristate '"recent" match support'
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	---help---
+	This match is used for creating one or many lists of recently
+	used addresses and then matching against that/those list(s).
+
+	Short options are available by using 'iptables -m recent -h'
+	Official Website: <http://snowman.net/projects/ipt_recent/>
+
 config NETFILTER_XT_MATCH_SCTP
 	tristate  '"sctp" protocol match support (EXPERIMENTAL)'
 	depends on NETFILTER_XTABLES && EXPERIMENTAL
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3bd2cc556ae..f101cf61e6f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -76,6 +76,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
new file mode 100644
index 00000000000..422c0e4d66b
--- /dev/null
+++ b/net/netfilter/xt_recent.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This is a replacement of the old ipt_recent module, which carried the
+ * following copyright notice:
+ *
+ * Author: Stephen Frost <sfrost@snowman.net>
+ * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
+ */
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/moduleparam.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <net/net_namespace.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_recent.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_recent");
+
+static unsigned int ip_list_tot = 100;
+static unsigned int ip_pkt_list_tot = 20;
+static unsigned int ip_list_hash_size = 0;
+static unsigned int ip_list_perms = 0644;
+static unsigned int ip_list_uid = 0;
+static unsigned int ip_list_gid = 0;
+module_param(ip_list_tot, uint, 0400);
+module_param(ip_pkt_list_tot, uint, 0400);
+module_param(ip_list_hash_size, uint, 0400);
+module_param(ip_list_perms, uint, 0400);
+module_param(ip_list_uid, uint, 0400);
+module_param(ip_list_gid, uint, 0400);
+MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
+MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
+MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files");
+MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files");
+
+struct recent_entry {
+	struct list_head	list;
+	struct list_head	lru_list;
+	__be32			addr;
+	u_int8_t		ttl;
+	u_int8_t		index;
+	u_int16_t		nstamps;
+	unsigned long		stamps[0];
+};
+
+struct recent_table {
+	struct list_head	list;
+	char			name[XT_RECENT_NAME_LEN];
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry	*proc;
+#endif
+	unsigned int		refcnt;
+	unsigned int		entries;
+	struct list_head	lru_list;
+	struct list_head	iphash[0];
+};
+
+static LIST_HEAD(tables);
+static DEFINE_SPINLOCK(recent_lock);
+static DEFINE_MUTEX(recent_mutex);
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry	*proc_dir;
+static const struct file_operations	recent_fops;
+#endif
+
+static u_int32_t hash_rnd;
+static int hash_rnd_initted;
+
+static unsigned int recent_entry_hash(__be32 addr)
+{
+	if (!hash_rnd_initted) {
+		get_random_bytes(&hash_rnd, 4);
+		hash_rnd_initted = 1;
+	}
+	return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1);
+}
+
+static struct recent_entry *
+recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl)
+{
+	struct recent_entry *e;
+	unsigned int h;
+
+	h = recent_entry_hash(addr);
+	list_for_each_entry(e, &table->iphash[h], list)
+		if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
+			return e;
+	return NULL;
+}
+
+static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
+{
+	list_del(&e->list);
+	list_del(&e->lru_list);
+	kfree(e);
+	t->entries--;
+}
+
+static struct recent_entry *
+recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl)
+{
+	struct recent_entry *e;
+
+	if (t->entries >= ip_list_tot) {
+		e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
+		recent_entry_remove(t, e);
+	}
+	e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
+		    GFP_ATOMIC);
+	if (e == NULL)
+		return NULL;
+	e->addr      = addr;
+	e->ttl       = ttl;
+	e->stamps[0] = jiffies;
+	e->nstamps   = 1;
+	e->index     = 1;
+	list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
+	list_add_tail(&e->lru_list, &t->lru_list);
+	t->entries++;
+	return e;
+}
+
+static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
+{
+	e->stamps[e->index++] = jiffies;
+	if (e->index > e->nstamps)
+		e->nstamps = e->index;
+	e->index %= ip_pkt_list_tot;
+	list_move_tail(&e->lru_list, &t->lru_list);
+}
+
+static struct recent_table *recent_table_lookup(const char *name)
+{
+	struct recent_table *t;
+
+	list_for_each_entry(t, &tables, list)
+		if (!strcmp(t->name, name))
+			return t;
+	return NULL;
+}
+
+static void recent_table_flush(struct recent_table *t)
+{
+	struct recent_entry *e, *next;
+	unsigned int i;
+
+	for (i = 0; i < ip_list_hash_size; i++)
+		list_for_each_entry_safe(e, next, &t->iphash[i], list)
+			recent_entry_remove(t, e);
+}
+
+static bool
+recent_mt(const struct sk_buff *skb, const struct net_device *in,
+          const struct net_device *out, const struct xt_match *match,
+          const void *matchinfo, int offset, unsigned int protoff,
+          bool *hotdrop)
+{
+	const struct xt_recent_mtinfo *info = matchinfo;
+	struct recent_table *t;
+	struct recent_entry *e;
+	__be32 addr;
+	u_int8_t ttl;
+	bool ret = info->invert;
+
+	if (info->side == XT_RECENT_DEST)
+		addr = ip_hdr(skb)->daddr;
+	else
+		addr = ip_hdr(skb)->saddr;
+
+	ttl = ip_hdr(skb)->ttl;
+	/* use TTL as seen before forwarding */
+	if (out && !skb->sk)
+		ttl++;
+
+	spin_lock_bh(&recent_lock);
+	t = recent_table_lookup(info->name);
+	e = recent_entry_lookup(t, addr,
+				info->check_set & XT_RECENT_TTL ? ttl : 0);
+	if (e == NULL) {
+		if (!(info->check_set & XT_RECENT_SET))
+			goto out;
+		e = recent_entry_init(t, addr, ttl);
+		if (e == NULL)
+			*hotdrop = true;
+		ret = !ret;
+		goto out;
+	}
+
+	if (info->check_set & XT_RECENT_SET)
+		ret = !ret;
+	else if (info->check_set & XT_RECENT_REMOVE) {
+		recent_entry_remove(t, e);
+		ret = !ret;
+	} else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) {
+		unsigned long time = jiffies - info->seconds * HZ;
+		unsigned int i, hits = 0;
+
+		for (i = 0; i < e->nstamps; i++) {
+			if (info->seconds && time_after(time, e->stamps[i]))
+				continue;
+			if (++hits >= info->hit_count) {
+				ret = !ret;
+				break;
+			}
+		}
+	}
+
+	if (info->check_set & XT_RECENT_SET ||
+	    (info->check_set & XT_RECENT_UPDATE && ret)) {
+		recent_entry_update(t, e);
+		e->ttl = ttl;
+	}
+out:
+	spin_unlock_bh(&recent_lock);
+	return ret;
+}
+
+static bool
+recent_mt_check(const char *tablename, const void *ip,
+                const struct xt_match *match, void *matchinfo,
+                unsigned int hook_mask)
+{
+	const struct xt_recent_mtinfo *info = matchinfo;
+	struct recent_table *t;
+	unsigned i;
+	bool ret = false;
+
+	if (hweight8(info->check_set &
+		     (XT_RECENT_SET | XT_RECENT_REMOVE |
+		      XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1)
+		return false;
+	if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) &&
+	    (info->seconds || info->hit_count))
+		return false;
+	if (info->hit_count > ip_pkt_list_tot)
+		return false;
+	if (info->name[0] == '\0' ||
+	    strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
+		return false;
+
+	mutex_lock(&recent_mutex);
+	t = recent_table_lookup(info->name);
+	if (t != NULL) {
+		t->refcnt++;
+		ret = true;
+		goto out;
+	}
+
+	t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
+		    GFP_KERNEL);
+	if (t == NULL)
+		goto out;
+	t->refcnt = 1;
+	strcpy(t->name, info->name);
+	INIT_LIST_HEAD(&t->lru_list);
+	for (i = 0; i < ip_list_hash_size; i++)
+		INIT_LIST_HEAD(&t->iphash[i]);
+#ifdef CONFIG_PROC_FS
+	t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops);
+	if (t->proc == NULL) {
+		kfree(t);
+		goto out;
+	}
+	t->proc->uid       = ip_list_uid;
+	t->proc->gid       = ip_list_gid;
+	t->proc->data      = t;
+#endif
+	spin_lock_bh(&recent_lock);
+	list_add_tail(&t->list, &tables);
+	spin_unlock_bh(&recent_lock);
+	ret = true;
+out:
+	mutex_unlock(&recent_mutex);
+	return ret;
+}
+
+static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
+{
+	const struct xt_recent_mtinfo *info = matchinfo;
+	struct recent_table *t;
+
+	mutex_lock(&recent_mutex);
+	t = recent_table_lookup(info->name);
+	if (--t->refcnt == 0) {
+		spin_lock_bh(&recent_lock);
+		list_del(&t->list);
+		spin_unlock_bh(&recent_lock);
+#ifdef CONFIG_PROC_FS
+		remove_proc_entry(t->name, proc_dir);
+#endif
+		recent_table_flush(t);
+		kfree(t);
+	}
+	mutex_unlock(&recent_mutex);
+}
+
+#ifdef CONFIG_PROC_FS
+struct recent_iter_state {
+	struct recent_table	*table;
+	unsigned int		bucket;
+};
+
+static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(recent_lock)
+{
+	struct recent_iter_state *st = seq->private;
+	const struct recent_table *t = st->table;
+	struct recent_entry *e;
+	loff_t p = *pos;
+
+	spin_lock_bh(&recent_lock);
+
+	for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++)
+		list_for_each_entry(e, &t->iphash[st->bucket], list)
+			if (p-- == 0)
+				return e;
+	return NULL;
+}
+
+static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct recent_iter_state *st = seq->private;
+	const struct recent_table *t = st->table;
+	struct recent_entry *e = v;
+	struct list_head *head = e->list.next;
+
+	while (head == &t->iphash[st->bucket]) {
+		if (++st->bucket >= ip_list_hash_size)
+			return NULL;
+		head = t->iphash[st->bucket].next;
+	}
+	(*pos)++;
+	return list_entry(head, struct recent_entry, list);
+}
+
+static void recent_seq_stop(struct seq_file *s, void *v)
+	__releases(recent_lock)
+{
+	spin_unlock_bh(&recent_lock);
+}
+
+static int recent_seq_show(struct seq_file *seq, void *v)
+{
+	const struct recent_entry *e = v;
+	unsigned int i;
+
+	i = (e->index - 1) % ip_pkt_list_tot;
+	seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
+		   NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
+	for (i = 0; i < e->nstamps; i++)
+		seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
+	seq_printf(seq, "\n");
+	return 0;
+}
+
+static const struct seq_operations recent_seq_ops = {
+	.start		= recent_seq_start,
+	.next		= recent_seq_next,
+	.stop		= recent_seq_stop,
+	.show		= recent_seq_show,
+};
+
+static int recent_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct recent_iter_state *st;
+
+	st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
+	if (st == NULL)
+		return -ENOMEM;
+
+	st->table    = pde->data;
+	return 0;
+}
+
+static ssize_t recent_proc_write(struct file *file, const char __user *input,
+				 size_t size, loff_t *loff)
+{
+	const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct recent_table *t = pde->data;
+	struct recent_entry *e;
+	char buf[sizeof("+255.255.255.255")], *c = buf;
+	__be32 addr;
+	int add;
+
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+	if (copy_from_user(buf, input, size))
+		return -EFAULT;
+	while (isspace(*c))
+		c++;
+
+	if (size - (c - buf) < 5)
+		return c - buf;
+	if (!strncmp(c, "clear", 5)) {
+		c += 5;
+		spin_lock_bh(&recent_lock);
+		recent_table_flush(t);
+		spin_unlock_bh(&recent_lock);
+		return c - buf;
+	}
+
+	switch (*c) {
+	case '-':
+		add = 0;
+		c++;
+		break;
+	case '+':
+		c++;
+	default:
+		add = 1;
+		break;
+	}
+	addr = in_aton(c);
+
+	spin_lock_bh(&recent_lock);
+	e = recent_entry_lookup(t, addr, 0);
+	if (e == NULL) {
+		if (add)
+			recent_entry_init(t, addr, 0);
+	} else {
+		if (add)
+			recent_entry_update(t, e);
+		else
+			recent_entry_remove(t, e);
+	}
+	spin_unlock_bh(&recent_lock);
+	return size;
+}
+
+static const struct file_operations recent_fops = {
+	.open		= recent_seq_open,
+	.read		= seq_read,
+	.write		= recent_proc_write,
+	.release	= seq_release_private,
+	.owner		= THIS_MODULE,
+};
+#endif /* CONFIG_PROC_FS */
+
+static struct xt_match recent_mt_reg __read_mostly = {
+	.name		= "recent",
+	.family		= AF_INET,
+	.match		= recent_mt,
+	.matchsize	= sizeof(struct xt_recent_mtinfo),
+	.checkentry	= recent_mt_check,
+	.destroy	= recent_mt_destroy,
+	.me		= THIS_MODULE,
+};
+
+static int __init recent_mt_init(void)
+{
+	int err;
+
+	if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
+		return -EINVAL;
+	ip_list_hash_size = 1 << fls(ip_list_tot);
+
+	err = xt_register_match(&recent_mt_reg);
+#ifdef CONFIG_PROC_FS
+	if (err)
+		return err;
+	proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
+	if (proc_dir == NULL) {
+		xt_unregister_match(&recent_mt_reg);
+		err = -ENOMEM;
+	}
+#endif
+	return err;
+}
+
+static void __exit recent_mt_exit(void)
+{
+	BUG_ON(!list_empty(&tables));
+	xt_unregister_match(&recent_mt_reg);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("ipt_recent", init_net.proc_net);
+#endif
+}
+
+module_init(recent_mt_init);
+module_exit(recent_mt_exit);
-- 
cgit v1.2.3


From 079aa88fe7172b7650c7cf2c0bc01662bafea236 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:00 +0200
Subject: netfilter: xt_recent: IPv6 support

This updates xt_recent to support the IPv6 address family.
The new /proc/net/xt_recent directory must be used for this.
The old proc interface can also be configured out.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/Kconfig     |   7 ++
 net/netfilter/xt_recent.c | 300 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 253 insertions(+), 54 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ccc78b07a1a..4a464857f21 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -743,6 +743,13 @@ config NETFILTER_XT_MATCH_RECENT
 	Short options are available by using 'iptables -m recent -h'
 	Official Website: <http://snowman.net/projects/ipt_recent/>
 
+config NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+	bool 'Enable obsolete /proc/net/ipt_recent'
+	depends on NETFILTER_XT_MATCH_RECENT && PROC_FS
+	---help---
+	This option enables the old /proc/net/ipt_recent interface,
+	which has been obsoleted by /proc/net/xt_recent.
+
 config NETFILTER_XT_MATCH_SCTP
 	tristate  '"sctp" protocol match support (EXPERIMENTAL)'
 	depends on NETFILTER_XTABLES && EXPERIMENTAL
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 422c0e4d66b..adc2e2f1b09 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -13,6 +14,8 @@
  */
 #include <linux/init.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -30,9 +33,11 @@
 #include <linux/netfilter/xt_recent.h>
 
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@computergmbh.de>");
 MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_recent");
+MODULE_ALIAS("ip6t_recent");
 
 static unsigned int ip_list_tot = 100;
 static unsigned int ip_pkt_list_tot = 20;
@@ -49,14 +54,15 @@ module_param(ip_list_gid, uint, 0400);
 MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
 MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
 MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
-MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files");
+MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files");
+MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/xt_recent/* files");
+MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/xt_recent/* files");
 
 struct recent_entry {
 	struct list_head	list;
 	struct list_head	lru_list;
-	__be32			addr;
+	union nf_inet_addr	addr;
+	u_int16_t		family;
 	u_int8_t		ttl;
 	u_int8_t		index;
 	u_int16_t		nstamps;
@@ -67,7 +73,7 @@ struct recent_table {
 	struct list_head	list;
 	char			name[XT_RECENT_NAME_LEN];
 #ifdef CONFIG_PROC_FS
-	struct proc_dir_entry	*proc;
+	struct proc_dir_entry	*proc_old, *proc;
 #endif
 	unsigned int		refcnt;
 	unsigned int		entries;
@@ -80,31 +86,53 @@ static DEFINE_SPINLOCK(recent_lock);
 static DEFINE_MUTEX(recent_mutex);
 
 #ifdef CONFIG_PROC_FS
-static struct proc_dir_entry	*proc_dir;
-static const struct file_operations	recent_fops;
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+static struct proc_dir_entry *proc_old_dir;
+#endif
+static struct proc_dir_entry *recent_proc_dir;
+static const struct file_operations recent_old_fops, recent_mt_fops;
 #endif
 
 static u_int32_t hash_rnd;
-static int hash_rnd_initted;
+static bool hash_rnd_initted;
+
+static unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
+{
+	if (!hash_rnd_initted) {
+		get_random_bytes(&hash_rnd, sizeof(hash_rnd));
+		hash_rnd_initted = true;
+	}
+	return jhash_1word((__force u32)addr->ip, hash_rnd) &
+	       (ip_list_hash_size - 1);
+}
 
-static unsigned int recent_entry_hash(__be32 addr)
+static unsigned int recent_entry_hash6(const union nf_inet_addr *addr)
 {
 	if (!hash_rnd_initted) {
-		get_random_bytes(&hash_rnd, 4);
-		hash_rnd_initted = 1;
+		get_random_bytes(&hash_rnd, sizeof(hash_rnd));
+		hash_rnd_initted = true;
 	}
-	return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1);
+	return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) &
+	       (ip_list_hash_size - 1);
 }
 
 static struct recent_entry *
-recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl)
+recent_entry_lookup(const struct recent_table *table,
+		    const union nf_inet_addr *addrp, u_int16_t family,
+		    u_int8_t ttl)
 {
 	struct recent_entry *e;
 	unsigned int h;
 
-	h = recent_entry_hash(addr);
+	if (family == AF_INET)
+		h = recent_entry_hash4(addrp);
+	else
+		h = recent_entry_hash6(addrp);
+
 	list_for_each_entry(e, &table->iphash[h], list)
-		if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
+		if (e->family == family &&
+		    memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 &&
+		    (ttl == e->ttl || ttl == 0 || e->ttl == 0))
 			return e;
 	return NULL;
 }
@@ -118,7 +146,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
 }
 
 static struct recent_entry *
-recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl)
+recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr,
+		  u_int16_t family, u_int8_t ttl)
 {
 	struct recent_entry *e;
 
@@ -130,12 +159,16 @@ recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl)
 		    GFP_ATOMIC);
 	if (e == NULL)
 		return NULL;
-	e->addr      = addr;
+	memcpy(&e->addr, addr, sizeof(e->addr));
 	e->ttl       = ttl;
 	e->stamps[0] = jiffies;
 	e->nstamps   = 1;
 	e->index     = 1;
-	list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
+	e->family    = family;
+	if (family == AF_INET)
+		list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]);
+	else
+		list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]);
 	list_add_tail(&e->lru_list, &t->lru_list);
 	t->entries++;
 	return e;
@@ -179,28 +212,42 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in,
 	const struct xt_recent_mtinfo *info = matchinfo;
 	struct recent_table *t;
 	struct recent_entry *e;
-	__be32 addr;
+	union nf_inet_addr addr = {};
 	u_int8_t ttl;
 	bool ret = info->invert;
 
-	if (info->side == XT_RECENT_DEST)
-		addr = ip_hdr(skb)->daddr;
-	else
-		addr = ip_hdr(skb)->saddr;
+	if (match->family == AF_INET) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		if (info->side == XT_RECENT_DEST)
+			addr.ip = iph->daddr;
+		else
+			addr.ip = iph->saddr;
+
+		ttl = iph->ttl;
+	} else {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		if (info->side == XT_RECENT_DEST)
+			memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6));
+		else
+			memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6));
+
+		ttl = iph->hop_limit;
+	}
 
-	ttl = ip_hdr(skb)->ttl;
 	/* use TTL as seen before forwarding */
 	if (out && !skb->sk)
 		ttl++;
 
 	spin_lock_bh(&recent_lock);
 	t = recent_table_lookup(info->name);
-	e = recent_entry_lookup(t, addr,
-				info->check_set & XT_RECENT_TTL ? ttl : 0);
+	e = recent_entry_lookup(t, &addr, match->family,
+				(info->check_set & XT_RECENT_TTL) ? ttl : 0);
 	if (e == NULL) {
 		if (!(info->check_set & XT_RECENT_SET))
 			goto out;
-		e = recent_entry_init(t, addr, ttl);
+		e = recent_entry_init(t, &addr, match->family, ttl);
 		if (e == NULL)
 			*hotdrop = true;
 		ret = !ret;
@@ -277,11 +324,24 @@ recent_mt_check(const char *tablename, const void *ip,
 	for (i = 0; i < ip_list_hash_size; i++)
 		INIT_LIST_HEAD(&t->iphash[i]);
 #ifdef CONFIG_PROC_FS
-	t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops);
+	t->proc = proc_create(t->name, ip_list_perms, recent_proc_dir,
+		  &recent_mt_fops);
 	if (t->proc == NULL) {
 		kfree(t);
 		goto out;
 	}
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+	t->proc_old = proc_create(t->name, ip_list_perms, proc_old_dir,
+		      &recent_old_fops);
+	if (t->proc_old == NULL) {
+		remove_proc_entry(t->name, proc_old_dir);
+		kfree(t);
+		goto out;
+	}
+	t->proc_old->uid   = ip_list_uid;
+	t->proc_old->gid   = ip_list_gid;
+	t->proc_old->data  = t;
+#endif
 	t->proc->uid       = ip_list_uid;
 	t->proc->gid       = ip_list_gid;
 	t->proc->data      = t;
@@ -307,7 +367,10 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
 		list_del(&t->list);
 		spin_unlock_bh(&recent_lock);
 #ifdef CONFIG_PROC_FS
-		remove_proc_entry(t->name, proc_dir);
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+		remove_proc_entry(t->name, proc_old_dir);
+#endif
+		remove_proc_entry(t->name, recent_proc_dir);
 #endif
 		recent_table_flush(t);
 		kfree(t);
@@ -317,7 +380,7 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
 
 #ifdef CONFIG_PROC_FS
 struct recent_iter_state {
-	struct recent_table	*table;
+	const struct recent_table *table;
 	unsigned int		bucket;
 };
 
@@ -342,8 +405,8 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct recent_iter_state *st = seq->private;
 	const struct recent_table *t = st->table;
-	struct recent_entry *e = v;
-	struct list_head *head = e->list.next;
+	const struct recent_entry *e = v;
+	const struct list_head *head = e->list.next;
 
 	while (head == &t->iphash[st->bucket]) {
 		if (++st->bucket >= ip_list_hash_size)
@@ -366,8 +429,14 @@ static int recent_seq_show(struct seq_file *seq, void *v)
 	unsigned int i;
 
 	i = (e->index - 1) % ip_pkt_list_tot;
-	seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
-		   NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
+	if (e->family == AF_INET)
+		seq_printf(seq, "src=" NIPQUAD_FMT " ttl: %u last_seen: %lu "
+			   "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl,
+			   e->stamps[i], e->index);
+	else
+		seq_printf(seq, "src=" NIP6_FMT " ttl: %u last_seen: %lu "
+			   "oldest_pkt: %u", NIP6(e->addr.in6), e->ttl,
+			   e->stamps[i], e->index);
 	for (i = 0; i < e->nstamps; i++)
 		seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
 	seq_printf(seq, "\n");
@@ -394,8 +463,22 @@ static int recent_seq_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static ssize_t recent_proc_write(struct file *file, const char __user *input,
-				 size_t size, loff_t *loff)
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+static int recent_old_seq_open(struct inode *inode, struct file *filp)
+{
+	static bool warned_of_old;
+
+	if (unlikely(!warned_of_old)) {
+		printk(KERN_INFO KBUILD_MODNAME ": Use of /proc/net/ipt_recent"
+		       " is deprecated; use /proc/net/xt_recent.\n");
+		warned_of_old = true;
+	}
+	return recent_seq_open(inode, filp);
+}
+
+static ssize_t recent_old_proc_write(struct file *file,
+				     const char __user *input,
+				     size_t size, loff_t *loff)
 {
 	const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	struct recent_table *t = pde->data;
@@ -408,6 +491,7 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input,
 		size = sizeof(buf);
 	if (copy_from_user(buf, input, size))
 		return -EFAULT;
+
 	while (isspace(*c))
 		c++;
 
@@ -435,10 +519,10 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input,
 	addr = in_aton(c);
 
 	spin_lock_bh(&recent_lock);
-	e = recent_entry_lookup(t, addr, 0);
+	e = recent_entry_lookup(t, (const void *)&addr, PF_INET, 0);
 	if (e == NULL) {
 		if (add)
-			recent_entry_init(t, addr, 0);
+			recent_entry_init(t, (const void *)&addr, PF_INET, 0);
 	} else {
 		if (add)
 			recent_entry_update(t, e);
@@ -449,23 +533,118 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input,
 	return size;
 }
 
-static const struct file_operations recent_fops = {
-	.open		= recent_seq_open,
+static const struct file_operations recent_old_fops = {
+	.open		= recent_old_seq_open,
 	.read		= seq_read,
-	.write		= recent_proc_write,
+	.write		= recent_old_proc_write,
 	.release	= seq_release_private,
 	.owner		= THIS_MODULE,
 };
+#endif
+
+static ssize_t
+recent_mt_proc_write(struct file *file, const char __user *input,
+		     size_t size, loff_t *loff)
+{
+	const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct recent_table *t = pde->data;
+	struct recent_entry *e;
+	char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
+	const char *c = buf;
+	union nf_inet_addr addr;
+	u_int16_t family;
+	bool add, succ;
+
+	if (size == 0)
+		return 0;
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+	if (copy_from_user(buf, input, size) != 0)
+		return -EFAULT;
+
+	/* Strict protocol! */
+	if (*loff != 0)
+		return -ESPIPE;
+	switch (*c) {
+	case '/': /* flush table */
+		spin_lock_bh(&recent_lock);
+		recent_table_flush(t);
+		spin_unlock_bh(&recent_lock);
+		return size;
+	case '-': /* remove address */
+		add = false;
+		break;
+	case '+': /* add address */
+		add = true;
+		break;
+	default:
+		printk(KERN_INFO KBUILD_MODNAME ": Need +ip, -ip or /\n");
+		return -EINVAL;
+	}
+
+	++c;
+	--size;
+	if (strnchr(c, size, ':') != NULL) {
+		family = AF_INET6;
+		succ   = in6_pton(c, size, (void *)&addr, '\n', NULL);
+	} else {
+		family = AF_INET;
+		succ   = in4_pton(c, size, (void *)&addr, '\n', NULL);
+	}
+
+	if (!succ) {
+		printk(KERN_INFO KBUILD_MODNAME ": illegal address written "
+		       "to procfs\n");
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&recent_lock);
+	e = recent_entry_lookup(t, &addr, family, 0);
+	if (e == NULL) {
+		if (add)
+			recent_entry_init(t, &addr, family, 0);
+	} else {
+		if (add)
+			recent_entry_update(t, e);
+		else
+			recent_entry_remove(t, e);
+	}
+	spin_unlock_bh(&recent_lock);
+	/* Note we removed one above */
+	*loff += size + 1;
+	return size + 1;
+}
+
+static const struct file_operations recent_mt_fops = {
+	.open    = recent_seq_open,
+	.read    = seq_read,
+	.write   = recent_mt_proc_write,
+	.release = seq_release_private,
+	.owner   = THIS_MODULE,
+};
 #endif /* CONFIG_PROC_FS */
 
-static struct xt_match recent_mt_reg __read_mostly = {
-	.name		= "recent",
-	.family		= AF_INET,
-	.match		= recent_mt,
-	.matchsize	= sizeof(struct xt_recent_mtinfo),
-	.checkentry	= recent_mt_check,
-	.destroy	= recent_mt_destroy,
-	.me		= THIS_MODULE,
+static struct xt_match recent_mt_reg[] __read_mostly = {
+	{
+		.name       = "recent",
+		.revision   = 0,
+		.family     = AF_INET,
+		.match      = recent_mt,
+		.matchsize  = sizeof(struct xt_recent_mtinfo),
+		.checkentry = recent_mt_check,
+		.destroy    = recent_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "recent",
+		.revision   = 0,
+		.family     = AF_INET6,
+		.match      = recent_mt,
+		.matchsize  = sizeof(struct xt_recent_mtinfo),
+		.checkentry = recent_mt_check,
+		.destroy    = recent_mt_destroy,
+		.me         = THIS_MODULE,
+	},
 };
 
 static int __init recent_mt_init(void)
@@ -476,15 +655,25 @@ static int __init recent_mt_init(void)
 		return -EINVAL;
 	ip_list_hash_size = 1 << fls(ip_list_tot);
 
-	err = xt_register_match(&recent_mt_reg);
+	err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
 #ifdef CONFIG_PROC_FS
 	if (err)
 		return err;
-	proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
-	if (proc_dir == NULL) {
-		xt_unregister_match(&recent_mt_reg);
+	recent_proc_dir = proc_mkdir("xt_recent", init_net.proc_net);
+	if (recent_proc_dir == NULL) {
+		xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+		err = -ENOMEM;
+	}
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+	if (err < 0)
+		return err;
+	proc_old_dir = proc_mkdir("ipt_recent", init_net.proc_net);
+	if (proc_old_dir == NULL) {
+		remove_proc_entry("xt_recent", init_net.proc_net);
+		xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
 		err = -ENOMEM;
 	}
+#endif
 #endif
 	return err;
 }
@@ -492,9 +681,12 @@ static int __init recent_mt_init(void)
 static void __exit recent_mt_exit(void)
 {
 	BUG_ON(!list_empty(&tables));
-	xt_unregister_match(&recent_mt_reg);
+	xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
 #ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
 	remove_proc_entry("ipt_recent", init_net.proc_net);
+#endif
+	remove_proc_entry("xt_recent", init_net.proc_net);
 #endif
 }
 
-- 
cgit v1.2.3


From 7e9c6eeb136a46dfd941852803b3a9dd78939b69 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:00 +0200
Subject: netfilter: Introduce NFPROTO_* constants

The netfilter subsystem only supports a handful of protocols (much
less than PF_*) and even non-PF protocols like ARP and
pseudo-protocols like PF_BRIDGE. By creating NFPROTO_*, we can earn a
few memory savings on arrays that previously were always PF_MAX-sized
and keep the pseudo-protocols to ourselves.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/core.c     |  6 +++---
 net/netfilter/nf_log.c   | 12 ++++++------
 net/netfilter/nf_queue.c | 12 ++++++------
 net/netfilter/x_tables.c | 18 ++++++++++--------
 4 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 26b8f489d7a..b16cd79951c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -26,7 +26,7 @@
 
 static DEFINE_MUTEX(afinfo_mutex);
 
-const struct nf_afinfo *nf_afinfo[NPROTO] __read_mostly;
+const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
 EXPORT_SYMBOL(nf_afinfo);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
@@ -51,7 +51,7 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
 }
 EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
 
-struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS] __read_mostly;
+struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
 EXPORT_SYMBOL(nf_hooks);
 static DEFINE_MUTEX(nf_hook_mutex);
 
@@ -264,7 +264,7 @@ EXPORT_SYMBOL(proc_net_netfilter);
 void __init netfilter_init(void)
 {
 	int i, h;
-	for (i = 0; i < NPROTO; i++) {
+	for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
 		for (h = 0; h < NF_MAX_HOOKS; h++)
 			INIT_LIST_HEAD(&nf_hooks[i][h]);
 	}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 5c2f7332015..fa8ae5d2659 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -15,7 +15,7 @@
 
 #define NF_LOG_PREFIXLEN		128
 
-static const struct nf_logger *nf_loggers[NPROTO] __read_mostly;
+static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
 static DEFINE_MUTEX(nf_log_mutex);
 
 /* return EBUSY if somebody else is registered, EEXIST if the same logger
@@ -24,7 +24,7 @@ int nf_log_register(u_int8_t pf, const struct nf_logger *logger)
 {
 	int ret;
 
-	if (pf >= NPROTO)
+	if (pf >= ARRAY_SIZE(nf_loggers))
 		return -EINVAL;
 
 	/* Any setup of logging members must be done before
@@ -47,7 +47,7 @@ EXPORT_SYMBOL(nf_log_register);
 
 void nf_log_unregister_pf(u_int8_t pf)
 {
-	if (pf >= NPROTO)
+	if (pf >= ARRAY_SIZE(nf_loggers))
 		return;
 	mutex_lock(&nf_log_mutex);
 	rcu_assign_pointer(nf_loggers[pf], NULL);
@@ -63,7 +63,7 @@ void nf_log_unregister(const struct nf_logger *logger)
 	int i;
 
 	mutex_lock(&nf_log_mutex);
-	for (i = 0; i < NPROTO; i++) {
+	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
 		if (nf_loggers[i] == logger)
 			rcu_assign_pointer(nf_loggers[i], NULL);
 	}
@@ -103,7 +103,7 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)
 {
 	rcu_read_lock();
 
-	if (*pos >= NPROTO)
+	if (*pos >= ARRAY_SIZE(nf_loggers))
 		return NULL;
 
 	return pos;
@@ -113,7 +113,7 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
 	(*pos)++;
 
-	if (*pos >= NPROTO)
+	if (*pos >= ARRAY_SIZE(nf_loggers))
 		return NULL;
 
 	return pos;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index f285086f629..4f2310c93e0 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -16,7 +16,7 @@
  * long term mutex.  The handler must provide an an outfn() to accept packets
  * for queueing and must reinject all packets it receives, no matter what.
  */
-static const struct nf_queue_handler *queue_handler[NPROTO];
+static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
 
 static DEFINE_MUTEX(queue_handler_mutex);
 
@@ -26,7 +26,7 @@ int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
 {
 	int ret;
 
-	if (pf >= NPROTO)
+	if (pf >= ARRAY_SIZE(queue_handler))
 		return -EINVAL;
 
 	mutex_lock(&queue_handler_mutex);
@@ -47,7 +47,7 @@ EXPORT_SYMBOL(nf_register_queue_handler);
 /* The caller must flush their queue before this */
 int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
 {
-	if (pf >= NPROTO)
+	if (pf >= ARRAY_SIZE(queue_handler))
 		return -EINVAL;
 
 	mutex_lock(&queue_handler_mutex);
@@ -70,7 +70,7 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
 	u_int8_t pf;
 
 	mutex_lock(&queue_handler_mutex);
-	for (pf = 0; pf < NPROTO; pf++)  {
+	for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++)  {
 		if (queue_handler[pf] == qh)
 			rcu_assign_pointer(queue_handler[pf], NULL);
 	}
@@ -285,7 +285,7 @@ EXPORT_SYMBOL(nf_reinject);
 #ifdef CONFIG_PROC_FS
 static void *seq_start(struct seq_file *seq, loff_t *pos)
 {
-	if (*pos >= NPROTO)
+	if (*pos >= ARRAY_SIZE(queue_handler))
 		return NULL;
 
 	return pos;
@@ -295,7 +295,7 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
 	(*pos)++;
 
-	if (*pos >= NPROTO)
+	if (*pos >= ARRAY_SIZE(queue_handler))
 		return NULL;
 
 	return pos;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index cf2f3e90cef..2a7eb1da5d0 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -58,10 +58,12 @@ static struct xt_af *xt;
 #define duprintf(format, args...)
 #endif
 
-static const char *const xt_prefix[NPROTO] = {
-	[AF_INET]	= "ip",
-	[AF_INET6]	= "ip6",
-	[NF_ARP]	= "arp",
+static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
+	[NFPROTO_UNSPEC] = "x",
+	[NFPROTO_IPV4]   = "ip",
+	[NFPROTO_ARP]    = "arp",
+	[NFPROTO_BRIDGE] = "eb",
+	[NFPROTO_IPV6]   = "ip6",
 };
 
 /* Registration hooks for targets. */
@@ -932,7 +934,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
 	struct proc_dir_entry *proc;
 #endif
 
-	if (af >= NPROTO)
+	if (af >= ARRAY_SIZE(xt_prefix))
 		return -EINVAL;
 
 
@@ -1001,7 +1003,7 @@ static int __net_init xt_net_init(struct net *net)
 {
 	int i;
 
-	for (i = 0; i < NPROTO; i++)
+	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		INIT_LIST_HEAD(&net->xt.tables[i]);
 	return 0;
 }
@@ -1014,11 +1016,11 @@ static int __init xt_init(void)
 {
 	int i, rv;
 
-	xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL);
+	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
 		return -ENOMEM;
 
-	for (i = 0; i < NPROTO; i++) {
+	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
 		mutex_init(&xt[i].mutex);
 #ifdef CONFIG_COMPAT
 		mutex_init(&xt[i].compat_mutex);
-- 
cgit v1.2.3


From ee999d8b9573df1b547aacdc6d79f86eb79c25cd Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:01 +0200
Subject: netfilter: x_tables: use NFPROTO_* in extensions

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_log.c       |  6 ++--
 net/bridge/netfilter/ebt_ulog.c      |  2 +-
 net/ipv4/netfilter/arp_tables.c      | 58 +++++++++++++++++++-----------------
 net/ipv4/netfilter/arpt_mangle.c     |  2 +-
 net/ipv4/netfilter/arptable_filter.c |  8 ++---
 net/ipv4/netfilter/ipt_CLUSTERIP.c   |  4 +--
 net/ipv4/netfilter/ipt_ECN.c         |  2 +-
 net/ipv4/netfilter/ipt_LOG.c         |  6 ++--
 net/ipv4/netfilter/ipt_MASQUERADE.c  |  2 +-
 net/ipv4/netfilter/ipt_NETMAP.c      |  2 +-
 net/ipv4/netfilter/ipt_REDIRECT.c    |  2 +-
 net/ipv4/netfilter/ipt_REJECT.c      |  2 +-
 net/ipv4/netfilter/ipt_TTL.c         |  2 +-
 net/ipv4/netfilter/ipt_ULOG.c        |  4 +--
 net/ipv4/netfilter/ipt_addrtype.c    |  4 +--
 net/ipv4/netfilter/ipt_ah.c          |  2 +-
 net/ipv4/netfilter/ipt_ecn.c         |  2 +-
 net/ipv4/netfilter/ipt_ttl.c         |  2 +-
 net/ipv6/netfilter/ip6t_HL.c         |  2 +-
 net/ipv6/netfilter/ip6t_LOG.c        |  7 +++--
 net/ipv6/netfilter/ip6t_REJECT.c     |  2 +-
 net/ipv6/netfilter/ip6t_ah.c         |  2 +-
 net/ipv6/netfilter/ip6t_eui64.c      |  2 +-
 net/ipv6/netfilter/ip6t_frag.c       |  2 +-
 net/ipv6/netfilter/ip6t_hbh.c        |  4 +--
 net/ipv6/netfilter/ip6t_hl.c         |  2 +-
 net/ipv6/netfilter/ip6t_ipv6header.c |  2 +-
 net/ipv6/netfilter/ip6t_mh.c         |  2 +-
 net/ipv6/netfilter/ip6t_rt.c         |  2 +-
 net/netfilter/xt_CLASSIFY.c          |  4 +--
 net/netfilter/xt_CONNMARK.c          |  8 ++---
 net/netfilter/xt_CONNSECMARK.c       |  4 +--
 net/netfilter/xt_DSCP.c              | 10 +++----
 net/netfilter/xt_MARK.c              | 12 ++++----
 net/netfilter/xt_NFLOG.c             |  4 +--
 net/netfilter/xt_NFQUEUE.c           |  4 +--
 net/netfilter/xt_NOTRACK.c           |  4 +--
 net/netfilter/xt_RATEEST.c           |  4 +--
 net/netfilter/xt_SECMARK.c           |  4 +--
 net/netfilter/xt_TCPMSS.c            |  4 +--
 net/netfilter/xt_TCPOPTSTRIP.c       |  4 +--
 net/netfilter/xt_TRACE.c             |  4 +--
 net/netfilter/xt_comment.c           |  4 +--
 net/netfilter/xt_connbytes.c         |  4 +--
 net/netfilter/xt_connlimit.c         | 10 +++----
 net/netfilter/xt_connmark.c          |  8 ++---
 net/netfilter/xt_conntrack.c         | 10 +++----
 net/netfilter/xt_dccp.c              |  4 +--
 net/netfilter/xt_dscp.c              | 12 ++++----
 net/netfilter/xt_esp.c               |  4 +--
 net/netfilter/xt_hashlimit.c         | 40 ++++++++++++-------------
 net/netfilter/xt_helper.c            |  4 +--
 net/netfilter/xt_iprange.c           |  6 ++--
 net/netfilter/xt_length.c            |  4 +--
 net/netfilter/xt_limit.c             |  4 +--
 net/netfilter/xt_mac.c               |  4 +--
 net/netfilter/xt_mark.c              |  8 ++---
 net/netfilter/xt_multiport.c         |  8 ++---
 net/netfilter/xt_owner.c             |  8 ++---
 net/netfilter/xt_physdev.c           |  4 +--
 net/netfilter/xt_pkttype.c           |  8 ++---
 net/netfilter/xt_policy.c            |  8 ++---
 net/netfilter/xt_quota.c             |  4 +--
 net/netfilter/xt_rateest.c           |  4 +--
 net/netfilter/xt_realm.c             |  2 +-
 net/netfilter/xt_recent.c            | 21 ++++++-------
 net/netfilter/xt_sctp.c              |  4 +--
 net/netfilter/xt_state.c             |  4 +--
 net/netfilter/xt_statistic.c         |  4 +--
 net/netfilter/xt_string.c            |  8 ++---
 net/netfilter/xt_tcpmss.c            |  4 +--
 net/netfilter/xt_tcpudp.c            | 12 ++++----
 net/netfilter/xt_time.c              |  4 +--
 net/netfilter/xt_u32.c               |  4 +--
 74 files changed, 225 insertions(+), 223 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 3770cd8a7b3..8b17c64bcd7 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -206,10 +206,10 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	li.u.log.logflags = info->bitmask;
 
 	if (info->bitmask & EBT_LOG_NFLOG)
-		nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li,
+		nf_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li,
 			      "%s", info->prefix);
 	else
-		ebt_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li,
+		ebt_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li,
 			       info->prefix);
 }
 
@@ -234,7 +234,7 @@ static int __init ebt_log_init(void)
 	ret = ebt_register_watcher(&log);
 	if (ret < 0)
 		return ret;
-	nf_log_register(PF_BRIDGE, &ebt_log_logger);
+	nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger);
 	return 0;
 }
 
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index c84bda61afb..3b1678cd66f 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -310,7 +310,7 @@ static int __init ebt_ulog_init(void)
 		netlink_kernel_release(ebtulognl);
 
 	if (ret == 0)
-		nf_log_register(PF_BRIDGE, &ebt_ulog_logger);
+		nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
 
 	return ret;
 }
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 03e83a65aec..b4a9a1799c9 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -463,7 +463,8 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 	t = arpt_get_target(e);
 	target = t->u.kernel.target;
 
-	ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t),
+	ret = xt_check_target(target, NFPROTO_ARP,
+			      t->u.target_size - sizeof(*t),
 			      name, e->comefrom, 0, 0);
 	if (!ret && t->u.kernel.target->checkentry
 	    && !t->u.kernel.target->checkentry(name, e, target, t->data,
@@ -488,7 +489,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
 		return ret;
 
 	t = arpt_get_target(e);
-	target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name,
+	target = try_then_request_module(xt_find_target(NFPROTO_ARP,
+							t->u.user.name,
 							t->u.user.revision),
 					 "arpt_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
@@ -788,7 +790,7 @@ static void compat_standard_from_user(void *dst, void *src)
 	int v = *(compat_int_t *)src;
 
 	if (v > 0)
-		v += xt_compat_calc_jump(NF_ARP, v);
+		v += xt_compat_calc_jump(NFPROTO_ARP, v);
 	memcpy(dst, &v, sizeof(v));
 }
 
@@ -797,7 +799,7 @@ static int compat_standard_to_user(void __user *dst, void *src)
 	compat_int_t cv = *(int *)src;
 
 	if (cv > 0)
-		cv -= xt_compat_calc_jump(NF_ARP, cv);
+		cv -= xt_compat_calc_jump(NFPROTO_ARP, cv);
 	return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
 }
 
@@ -815,7 +817,7 @@ static int compat_calc_entry(struct arpt_entry *e,
 	t = arpt_get_target(e);
 	off += xt_compat_target_offset(t->u.kernel.target);
 	newinfo->size -= off;
-	ret = xt_compat_add_offset(NF_ARP, entry_offset, off);
+	ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
 	if (ret)
 		return ret;
 
@@ -866,9 +868,9 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
 	name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
-		xt_compat_lock(NF_ARP);
+		xt_compat_lock(NFPROTO_ARP);
 #endif
-	t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name),
+	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
 				    "arptable_%s", name);
 	if (t && !IS_ERR(t)) {
 		struct arpt_getinfo info;
@@ -878,7 +880,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
 		if (compat) {
 			struct xt_table_info tmp;
 			ret = compat_table_info(private, &tmp);
-			xt_compat_flush_offsets(NF_ARP);
+			xt_compat_flush_offsets(NFPROTO_ARP);
 			private = &tmp;
 		}
 #endif
@@ -901,7 +903,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
 		ret = t ? PTR_ERR(t) : -ENOENT;
 #ifdef CONFIG_COMPAT
 	if (compat)
-		xt_compat_unlock(NF_ARP);
+		xt_compat_unlock(NFPROTO_ARP);
 #endif
 	return ret;
 }
@@ -925,7 +927,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 		return -EINVAL;
 	}
 
-	t = xt_find_table_lock(net, NF_ARP, get.name);
+	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
 	if (t && !IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 
@@ -967,7 +969,7 @@ static int __do_replace(struct net *net, const char *name,
 		goto out;
 	}
 
-	t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name),
+	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
 				    "arptable_%s", name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1134,7 +1136,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	t = xt_find_table_lock(net, NF_ARP, name);
+	t = xt_find_table_lock(net, NFPROTO_ARP, name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1218,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	entry_offset = (void *)e - (void *)base;
 
 	t = compat_arpt_get_target(e);
-	target = try_then_request_module(xt_find_target(NF_ARP,
+	target = try_then_request_module(xt_find_target(NFPROTO_ARP,
 							t->u.user.name,
 							t->u.user.revision),
 					 "arpt_%s", t->u.user.name);
@@ -1232,7 +1234,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 
 	off += xt_compat_target_offset(target);
 	*size += off;
-	ret = xt_compat_add_offset(NF_ARP, entry_offset, off);
+	ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
 	if (ret)
 		goto release_target;
 
@@ -1333,7 +1335,7 @@ static int translate_compat_table(const char *name,
 
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
-	xt_compat_lock(NF_ARP);
+	xt_compat_lock(NFPROTO_ARP);
 	/* Walk through entries, checking offsets. */
 	ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
 					check_compat_entry_size_and_hooks,
@@ -1383,8 +1385,8 @@ static int translate_compat_table(const char *name,
 	ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
 					compat_copy_entry_from_user,
 					&pos, &size, name, newinfo, entry1);
-	xt_compat_flush_offsets(NF_ARP);
-	xt_compat_unlock(NF_ARP);
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
 	if (ret)
 		goto free_newinfo;
 
@@ -1420,8 +1422,8 @@ out:
 	COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
 	return ret;
 out_unlock:
-	xt_compat_flush_offsets(NF_ARP);
-	xt_compat_unlock(NF_ARP);
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
 	goto out;
 }
 
@@ -1607,8 +1609,8 @@ static int compat_get_entries(struct net *net,
 		return -EINVAL;
 	}
 
-	xt_compat_lock(NF_ARP);
-	t = xt_find_table_lock(net, NF_ARP, get.name);
+	xt_compat_lock(NFPROTO_ARP);
+	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
 	if (t && !IS_ERR(t)) {
 		const struct xt_table_info *private = t->private;
 		struct xt_table_info info;
@@ -1623,13 +1625,13 @@ static int compat_get_entries(struct net *net,
 				 private->size, get.size);
 			ret = -EAGAIN;
 		}
-		xt_compat_flush_offsets(NF_ARP);
+		xt_compat_flush_offsets(NFPROTO_ARP);
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
 		ret = t ? PTR_ERR(t) : -ENOENT;
 
-	xt_compat_unlock(NF_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
 	return ret;
 }
 
@@ -1709,7 +1711,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 			break;
 		}
 
-		try_then_request_module(xt_find_revision(NF_ARP, rev.name,
+		try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
 							 rev.revision, 1, &ret),
 					"arpt_%s", rev.name);
 		break;
@@ -1787,7 +1789,7 @@ void arpt_unregister_table(struct xt_table *table)
 static struct xt_target arpt_standard_target __read_mostly = {
 	.name		= ARPT_STANDARD_TARGET,
 	.targetsize	= sizeof(int),
-	.family		= NF_ARP,
+	.family		= NFPROTO_ARP,
 #ifdef CONFIG_COMPAT
 	.compatsize	= sizeof(compat_int_t),
 	.compat_from_user = compat_standard_from_user,
@@ -1799,7 +1801,7 @@ static struct xt_target arpt_error_target __read_mostly = {
 	.name		= ARPT_ERROR_TARGET,
 	.target		= arpt_error,
 	.targetsize	= ARPT_FUNCTION_MAXNAMELEN,
-	.family		= NF_ARP,
+	.family		= NFPROTO_ARP,
 };
 
 static struct nf_sockopt_ops arpt_sockopts = {
@@ -1821,12 +1823,12 @@ static struct nf_sockopt_ops arpt_sockopts = {
 
 static int __net_init arp_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NF_ARP);
+	return xt_proto_init(net, NFPROTO_ARP);
 }
 
 static void __net_exit arp_tables_net_exit(struct net *net)
 {
-	xt_proto_fini(net, NF_ARP);
+	xt_proto_fini(net, NFPROTO_ARP);
 }
 
 static struct pernet_operations arp_tables_net_ops = {
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index a385959d265..3f9e4ccd616 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -75,7 +75,7 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target,
 
 static struct xt_target arpt_mangle_reg __read_mostly = {
 	.name		= "mangle",
-	.family		= NF_ARP,
+	.family		= NFPROTO_ARP,
 	.target		= target,
 	.targetsize	= sizeof(struct arpt_mangle),
 	.checkentry	= checkentry,
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 082f5dd3156..bee3d117661 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -51,7 +51,7 @@ static struct xt_table packet_filter = {
 	.lock		= __RW_LOCK_UNLOCKED(packet_filter.lock),
 	.private	= NULL,
 	.me		= THIS_MODULE,
-	.af		= NF_ARP,
+	.af		= NFPROTO_ARP,
 };
 
 /* The work comes in here from netfilter.c */
@@ -89,21 +89,21 @@ static struct nf_hook_ops arpt_ops[] __read_mostly = {
 	{
 		.hook		= arpt_in_hook,
 		.owner		= THIS_MODULE,
-		.pf		= NF_ARP,
+		.pf		= NFPROTO_ARP,
 		.hooknum	= NF_ARP_IN,
 		.priority	= NF_IP_PRI_FILTER,
 	},
 	{
 		.hook		= arpt_out_hook,
 		.owner		= THIS_MODULE,
-		.pf		= NF_ARP,
+		.pf		= NFPROTO_ARP,
 		.hooknum	= NF_ARP_OUT,
 		.priority	= NF_IP_PRI_FILTER,
 	},
 	{
 		.hook		= arpt_forward_hook,
 		.owner		= THIS_MODULE,
-		.pf		= NF_ARP,
+		.pf		= NFPROTO_ARP,
 		.hooknum	= NF_ARP_FORWARD,
 		.priority	= NF_IP_PRI_FILTER,
 	},
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index fafe8ebb4c5..63faddc18a1 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -445,7 +445,7 @@ struct compat_ipt_clusterip_tgt_info
 
 static struct xt_target clusterip_tg_reg __read_mostly = {
 	.name		= "CLUSTERIP",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= clusterip_tg,
 	.checkentry	= clusterip_tg_check,
 	.destroy	= clusterip_tg_destroy,
@@ -546,7 +546,7 @@ arp_mangle(unsigned int hook,
 
 static struct nf_hook_ops cip_arp_ops __read_mostly = {
 	.hook = arp_mangle,
-	.pf = NF_ARP,
+	.pf = NFPROTO_ARP,
 	.hooknum = NF_ARP_OUT,
 	.priority = -1
 };
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index d60139c134c..aee2364afff 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -124,7 +124,7 @@ ecn_tg_check(const char *tablename, const void *e_void,
 
 static struct xt_target ecn_tg_reg __read_mostly = {
 	.name		= "ECN",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= ecn_tg,
 	.targetsize	= sizeof(struct ipt_ECN_info),
 	.table		= "mangle",
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 9330ba3577e..1c9785df4df 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -437,7 +437,7 @@ log_tg(struct sk_buff *skb, const struct net_device *in,
 	li.u.log.level = loginfo->level;
 	li.u.log.logflags = loginfo->logflags;
 
-	ipt_log_packet(PF_INET, hooknum, skb, in, out, &li,
+	ipt_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, &li,
 		       loginfo->prefix);
 	return XT_CONTINUE;
 }
@@ -463,7 +463,7 @@ log_tg_check(const char *tablename, const void *e,
 
 static struct xt_target log_tg_reg __read_mostly = {
 	.name		= "LOG",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= log_tg,
 	.targetsize	= sizeof(struct ipt_log_info),
 	.checkentry	= log_tg_check,
@@ -483,7 +483,7 @@ static int __init log_tg_init(void)
 	ret = xt_register_target(&log_tg_reg);
 	if (ret < 0)
 		return ret;
-	nf_log_register(PF_INET, &ipt_log_logger);
+	nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 0841aefaa50..9a4822f8243 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -153,7 +153,7 @@ static struct notifier_block masq_inet_notifier = {
 
 static struct xt_target masquerade_tg_reg __read_mostly = {
 	.name		= "MASQUERADE",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= masquerade_tg,
 	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
 	.table		= "nat",
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 6739abfd152..f281500bd7f 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -75,7 +75,7 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in,
 
 static struct xt_target netmap_tg_reg __read_mostly = {
 	.name 		= "NETMAP",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target 	= netmap_tg,
 	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
 	.table		= "nat",
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 5c6292449d1..ef496105eae 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -92,7 +92,7 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in,
 
 static struct xt_target redirect_tg_reg __read_mostly = {
 	.name		= "REDIRECT",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= redirect_tg,
 	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
 	.table		= "nat",
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 2639872849d..9f5da0c2cae 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -201,7 +201,7 @@ reject_tg_check(const char *tablename, const void *e_void,
 
 static struct xt_target reject_tg_reg __read_mostly = {
 	.name		= "REJECT",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= reject_tg,
 	.targetsize	= sizeof(struct ipt_reject_info),
 	.table		= "filter",
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 30eed65e733..7d01d424a71 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -80,7 +80,7 @@ ttl_tg_check(const char *tablename, const void *e,
 
 static struct xt_target ttl_tg_reg __read_mostly = {
 	.name 		= "TTL",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target 	= ttl_tg,
 	.targetsize	= sizeof(struct ipt_TTL_info),
 	.table		= "mangle",
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d8241e6b077..9065e4a34fb 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -374,7 +374,7 @@ static int ulog_tg_compat_to_user(void __user *dst, void *src)
 
 static struct xt_target ulog_tg_reg __read_mostly = {
 	.name		= "ULOG",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.target		= ulog_tg,
 	.targetsize	= sizeof(struct ipt_ulog_info),
 	.checkentry	= ulog_tg_check,
@@ -419,7 +419,7 @@ static int __init ulog_tg_init(void)
 		return ret;
 	}
 	if (nflog)
-		nf_log_register(PF_INET, &ipt_ulog_logger);
+		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
 
 	return 0;
 }
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 462a22c9787..2c9d88a6c83 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -108,14 +108,14 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
 static struct xt_match addrtype_mt_reg[] __read_mostly = {
 	{
 		.name		= "addrtype",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= addrtype_mt_v0,
 		.matchsize	= sizeof(struct ipt_addrtype_info),
 		.me		= THIS_MODULE
 	},
 	{
 		.name		= "addrtype",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.revision	= 1,
 		.match		= addrtype_mt_v1,
 		.checkentry	= addrtype_mt_checkentry_v1,
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index e977989629c..e2e993edd66 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -83,7 +83,7 @@ ah_mt_check(const char *tablename, const void *ip_void,
 
 static struct xt_match ah_mt_reg __read_mostly = {
 	.name		= "ah",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.match		= ah_mt,
 	.matchsize	= sizeof(struct ipt_ah),
 	.proto		= IPPROTO_AH,
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 749de8284ce..2c45b4be7c3 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -114,7 +114,7 @@ ecn_mt_check(const char *tablename, const void *ip_void,
 
 static struct xt_match ecn_mt_reg __read_mostly = {
 	.name		= "ecn",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.match		= ecn_mt,
 	.matchsize	= sizeof(struct ipt_ecn_info),
 	.checkentry	= ecn_mt_check,
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index e0b8caeb710..d4c3fdc2a79 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -46,7 +46,7 @@ ttl_mt(const struct sk_buff *skb, const struct net_device *in,
 
 static struct xt_match ttl_mt_reg __read_mostly = {
 	.name		= "ttl",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.match		= ttl_mt,
 	.matchsize	= sizeof(struct ipt_ttl_info),
 	.me		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index d5f8fd5f29d..7eebd350916 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -78,7 +78,7 @@ hl_tg6_check(const char *tablename, const void *entry,
 
 static struct xt_target hl_tg6_reg __read_mostly = {
 	.name 		= "HL",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.target		= hl_tg6,
 	.targetsize	= sizeof(struct ip6t_HL_info),
 	.table		= "mangle",
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0716f8afa0b..fd148f3d842 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -449,7 +449,8 @@ log_tg6(struct sk_buff *skb, const struct net_device *in,
 	li.u.log.level = loginfo->level;
 	li.u.log.logflags = loginfo->logflags;
 
-	ip6t_log_packet(PF_INET6, hooknum, skb, in, out, &li, loginfo->prefix);
+	ip6t_log_packet(NFPROTO_IPV6, hooknum, skb, in, out,
+			&li, loginfo->prefix);
 	return XT_CONTINUE;
 }
 
@@ -475,7 +476,7 @@ log_tg6_check(const char *tablename, const void *entry,
 
 static struct xt_target log_tg6_reg __read_mostly = {
 	.name 		= "LOG",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.target 	= log_tg6,
 	.targetsize	= sizeof(struct ip6t_log_info),
 	.checkentry	= log_tg6_check,
@@ -495,7 +496,7 @@ static int __init log_tg6_init(void)
 	ret = xt_register_target(&log_tg6_reg);
 	if (ret < 0)
 		return ret;
-	nf_log_register(PF_INET6, &ip6t_logger);
+	nf_log_register(NFPROTO_IPV6, &ip6t_logger);
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 44c8d65a243..672ad9ff3e2 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -237,7 +237,7 @@ reject_tg6_check(const char *tablename, const void *entry,
 
 static struct xt_target reject_tg6_reg __read_mostly = {
 	.name		= "REJECT",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.target		= reject_tg6,
 	.targetsize	= sizeof(struct ip6t_reject_info),
 	.table		= "filter",
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 429629fd63b..061f89beeb6 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -110,7 +110,7 @@ ah_mt6_check(const char *tablename, const void *entry,
 
 static struct xt_match ah_mt6_reg __read_mostly = {
 	.name		= "ah",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.match		= ah_mt6,
 	.matchsize	= sizeof(struct ip6t_ah),
 	.checkentry	= ah_mt6_check,
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index 8f331f12b2e..ba38df1116f 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -60,7 +60,7 @@ eui64_mt6(const struct sk_buff *skb, const struct net_device *in,
 
 static struct xt_match eui64_mt6_reg __read_mostly = {
 	.name		= "eui64",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.match		= eui64_mt6,
 	.matchsize	= sizeof(int),
 	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index e2bbc63dba5..972f699af22 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -127,7 +127,7 @@ frag_mt6_check(const char *tablename, const void *ip,
 
 static struct xt_match frag_mt6_reg __read_mostly = {
 	.name		= "frag",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.match		= frag_mt6,
 	.matchsize	= sizeof(struct ip6t_frag),
 	.checkentry	= frag_mt6_check,
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 26654b26d7f..d5edb51a595 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -187,7 +187,7 @@ hbh_mt6_check(const char *tablename, const void *entry,
 static struct xt_match hbh_mt6_reg[] __read_mostly = {
 	{
 		.name		= "hbh",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= hbh_mt6,
 		.matchsize	= sizeof(struct ip6t_opts),
 		.checkentry	= hbh_mt6_check,
@@ -196,7 +196,7 @@ static struct xt_match hbh_mt6_reg[] __read_mostly = {
 	},
 	{
 		.name		= "dst",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= hbh_mt6,
 		.matchsize	= sizeof(struct ip6t_opts),
 		.checkentry	= hbh_mt6_check,
diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c
index 34567167384..25c1eb92fac 100644
--- a/net/ipv6/netfilter/ip6t_hl.c
+++ b/net/ipv6/netfilter/ip6t_hl.c
@@ -51,7 +51,7 @@ hl_mt6(const struct sk_buff *skb, const struct net_device *in,
 
 static struct xt_match hl_mt6_reg __read_mostly = {
 	.name		= "hl",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.match		= hl_mt6,
 	.matchsize	= sizeof(struct ip6t_hl_info),
 	.me		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 317a8960a75..ef0661aacea 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -138,7 +138,7 @@ ipv6header_mt6_check(const char *tablename, const void *ip,
 
 static struct xt_match ipv6header_mt6_reg __read_mostly = {
 	.name		= "ipv6header",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.match		= ipv6header_mt6,
 	.matchsize	= sizeof(struct ip6t_ipv6header_info),
 	.checkentry	= ipv6header_mt6_check,
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index e06678d07ec..dd876274ff6 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -84,7 +84,7 @@ mh_mt6_check(const char *tablename, const void *entry,
 
 static struct xt_match mh_mt6_reg __read_mostly = {
 	.name		= "mh",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.checkentry	= mh_mt6_check,
 	.match		= mh_mt6,
 	.matchsize	= sizeof(struct ip6t_mh),
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 81aaf7aaaab..7c544ae591d 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -214,7 +214,7 @@ rt_mt6_check(const char *tablename, const void *entry,
 
 static struct xt_match rt_mt6_reg __read_mostly = {
 	.name		= "rt",
-	.family		= AF_INET6,
+	.family		= NFPROTO_IPV6,
 	.match		= rt_mt6,
 	.matchsize	= sizeof(struct ip6t_rt),
 	.checkentry	= rt_mt6_check,
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 77a52bf8322..9d68da1748b 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -39,7 +39,7 @@ classify_tg(struct sk_buff *skb, const struct net_device *in,
 
 static struct xt_target classify_tg_reg[] __read_mostly = {
 	{
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.name 		= "CLASSIFY",
 		.target 	= classify_tg,
 		.targetsize	= sizeof(struct xt_classify_target_info),
@@ -51,7 +51,7 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
 	},
 	{
 		.name 		= "CLASSIFY",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.target 	= classify_tg,
 		.targetsize	= sizeof(struct xt_classify_target_info),
 		.table		= "mangle",
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index 5fecfb4794b..e72e5d01752 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -197,7 +197,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
 	{
 		.name		= "CONNMARK",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= connmark_tg_check_v0,
 		.destroy	= connmark_tg_destroy,
 		.target		= connmark_tg_v0,
@@ -212,7 +212,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
 	{
 		.name		= "CONNMARK",
 		.revision	= 0,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= connmark_tg_check_v0,
 		.destroy	= connmark_tg_destroy,
 		.target		= connmark_tg_v0,
@@ -227,7 +227,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
 	{
 		.name           = "CONNMARK",
 		.revision       = 1,
-		.family         = AF_INET,
+		.family         = NFPROTO_IPV4,
 		.checkentry     = connmark_tg_check,
 		.target         = connmark_tg,
 		.targetsize     = sizeof(struct xt_connmark_tginfo1),
@@ -237,7 +237,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
 	{
 		.name           = "CONNMARK",
 		.revision       = 1,
-		.family         = AF_INET6,
+		.family         = NFPROTO_IPV6,
 		.checkentry     = connmark_tg_check,
 		.target         = connmark_tg,
 		.targetsize     = sizeof(struct xt_connmark_tginfo1),
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 76ca1f2421e..ae939e54dfa 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -127,7 +127,7 @@ connsecmark_tg_destroy(const struct xt_target *target, void *targinfo)
 static struct xt_target connsecmark_tg_reg[] __read_mostly = {
 	{
 		.name		= "CONNSECMARK",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= connsecmark_tg_check,
 		.destroy	= connsecmark_tg_destroy,
 		.target		= connsecmark_tg,
@@ -136,7 +136,7 @@ static struct xt_target connsecmark_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "CONNSECMARK",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= connsecmark_tg_check,
 		.destroy	= connsecmark_tg_destroy,
 		.target		= connsecmark_tg,
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 97efd74c04f..f0b4958528e 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -165,7 +165,7 @@ tos_tg6(struct sk_buff *skb, const struct net_device *in,
 static struct xt_target dscp_tg_reg[] __read_mostly = {
 	{
 		.name		= "DSCP",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= dscp_tg_check,
 		.target		= dscp_tg,
 		.targetsize	= sizeof(struct xt_DSCP_info),
@@ -174,7 +174,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "DSCP",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= dscp_tg_check,
 		.target		= dscp_tg6,
 		.targetsize	= sizeof(struct xt_DSCP_info),
@@ -184,7 +184,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
 	{
 		.name		= "TOS",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.table		= "mangle",
 		.target		= tos_tg_v0,
 		.targetsize	= sizeof(struct ipt_tos_target_info),
@@ -194,7 +194,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
 	{
 		.name		= "TOS",
 		.revision	= 1,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.table		= "mangle",
 		.target		= tos_tg,
 		.targetsize	= sizeof(struct xt_tos_target_info),
@@ -203,7 +203,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
 	{
 		.name		= "TOS",
 		.revision	= 1,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.table		= "mangle",
 		.target		= tos_tg6,
 		.targetsize	= sizeof(struct xt_tos_target_info),
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index f9ce20b5898..55ef0796c76 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -161,7 +161,7 @@ static int mark_tg_compat_to_user_v1(void __user *dst, void *src)
 static struct xt_target mark_tg_reg[] __read_mostly = {
 	{
 		.name		= "MARK",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.revision	= 0,
 		.checkentry	= mark_tg_check_v0,
 		.target		= mark_tg_v0,
@@ -176,7 +176,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "MARK",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.revision	= 1,
 		.checkentry	= mark_tg_check_v1,
 		.target		= mark_tg_v1,
@@ -191,7 +191,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "MARK",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.revision	= 0,
 		.checkentry	= mark_tg_check_v0,
 		.target		= mark_tg_v0,
@@ -206,7 +206,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "MARK",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.revision	= 1,
 		.checkentry	= mark_tg_check_v1,
 		.target		= mark_tg_v1,
@@ -222,7 +222,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	{
 		.name           = "MARK",
 		.revision       = 2,
-		.family         = AF_INET,
+		.family         = NFPROTO_IPV4,
 		.target         = mark_tg,
 		.targetsize     = sizeof(struct xt_mark_tginfo2),
 		.me             = THIS_MODULE,
@@ -230,7 +230,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	{
 		.name           = "MARK",
 		.revision       = 2,
-		.family         = AF_INET6,
+		.family         = NFPROTO_IPV6,
 		.target         = mark_tg,
 		.targetsize     = sizeof(struct xt_mark_tginfo2),
 		.me             = THIS_MODULE,
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 19ae8efae65..9b095520176 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -55,7 +55,7 @@ nflog_tg_check(const char *tablename, const void *entry,
 static struct xt_target nflog_tg_reg[] __read_mostly = {
 	{
 		.name		= "NFLOG",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= nflog_tg_check,
 		.target		= nflog_tg,
 		.targetsize	= sizeof(struct xt_nflog_info),
@@ -63,7 +63,7 @@ static struct xt_target nflog_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "NFLOG",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= nflog_tg_check,
 		.target		= nflog_tg,
 		.targetsize	= sizeof(struct xt_nflog_info),
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index beb24d19a56..c03c2e8d06f 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -36,14 +36,14 @@ nfqueue_tg(struct sk_buff *skb, const struct net_device *in,
 static struct xt_target nfqueue_tg_reg[] __read_mostly = {
 	{
 		.name		= "NFQUEUE",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.target		= nfqueue_tg,
 		.targetsize	= sizeof(struct xt_NFQ_info),
 		.me		= THIS_MODULE,
 	},
 	{
 		.name		= "NFQUEUE",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.target		= nfqueue_tg,
 		.targetsize	= sizeof(struct xt_NFQ_info),
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index 6c9de611eb8..b9ee268b37c 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -35,14 +35,14 @@ notrack_tg(struct sk_buff *skb, const struct net_device *in,
 static struct xt_target notrack_tg_reg[] __read_mostly = {
 	{
 		.name		= "NOTRACK",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.target		= notrack_tg,
 		.table		= "raw",
 		.me		= THIS_MODULE,
 	},
 	{
 		.name		= "NOTRACK",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.target		= notrack_tg,
 		.table		= "raw",
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 64d6ad38029..f7114fc5cfc 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -159,7 +159,7 @@ static void xt_rateest_tg_destroy(const struct xt_target *target,
 
 static struct xt_target xt_rateest_target[] __read_mostly = {
 	{
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.name		= "RATEEST",
 		.target		= xt_rateest_tg,
 		.checkentry	= xt_rateest_tg_checkentry,
@@ -168,7 +168,7 @@ static struct xt_target xt_rateest_target[] __read_mostly = {
 		.me		= THIS_MODULE,
 	},
 	{
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.name		= "RATEEST",
 		.target		= xt_rateest_tg,
 		.checkentry	= xt_rateest_tg_checkentry,
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 94f87ee7552..8f8f57b93a6 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -128,7 +128,7 @@ static void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
 static struct xt_target secmark_tg_reg[] __read_mostly = {
 	{
 		.name		= "SECMARK",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= secmark_tg_check,
 		.destroy	= secmark_tg_destroy,
 		.target		= secmark_tg,
@@ -137,7 +137,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "SECMARK",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= secmark_tg_check,
 		.destroy	= secmark_tg_destroy,
 		.target		= secmark_tg,
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index beb5094703c..b868f995239 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -289,7 +289,7 @@ tcpmss_tg6_check(const char *tablename, const void *entry,
 
 static struct xt_target tcpmss_tg_reg[] __read_mostly = {
 	{
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.name		= "TCPMSS",
 		.checkentry	= tcpmss_tg4_check,
 		.target		= tcpmss_tg4,
@@ -299,7 +299,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
 	},
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	{
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.name		= "TCPMSS",
 		.checkentry	= tcpmss_tg6_check,
 		.target		= tcpmss_tg6,
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 9685b6fcbc8..2e0ae6cc5d9 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -106,7 +106,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in,
 static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
 	{
 		.name       = "TCPOPTSTRIP",
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.table      = "mangle",
 		.proto      = IPPROTO_TCP,
 		.target     = tcpoptstrip_tg4,
@@ -116,7 +116,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
 #if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE)
 	{
 		.name       = "TCPOPTSTRIP",
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.table      = "mangle",
 		.proto      = IPPROTO_TCP,
 		.target     = tcpoptstrip_tg6,
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 30dab79a343..e1bcad57914 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -22,14 +22,14 @@ trace_tg(struct sk_buff *skb, const struct net_device *in,
 static struct xt_target trace_tg_reg[] __read_mostly = {
 	{
 		.name		= "TRACE",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.target		= trace_tg,
 		.table		= "raw",
 		.me		= THIS_MODULE,
 	},
 	{
 		.name		= "TRACE",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.target		= trace_tg,
 		.table		= "raw",
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index 89f47364e84..fa211b2ab87 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -28,14 +28,14 @@ comment_mt(const struct sk_buff *skb, const struct net_device *in,
 static struct xt_match comment_mt_reg[] __read_mostly = {
 	{
 		.name		= "comment",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= comment_mt,
 		.matchsize	= sizeof(struct xt_comment_info),
 		.me		= THIS_MODULE
 	},
 	{
 		.name		= "comment",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= comment_mt,
 		.matchsize	= sizeof(struct xt_comment_info),
 		.me		= THIS_MODULE
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 3e39c4fe193..d2cd22a49c9 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -130,7 +130,7 @@ connbytes_mt_destroy(const struct xt_match *match, void *matchinfo)
 static struct xt_match connbytes_mt_reg[] __read_mostly = {
 	{
 		.name		= "connbytes",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= connbytes_mt_check,
 		.match		= connbytes_mt,
 		.destroy	= connbytes_mt_destroy,
@@ -139,7 +139,7 @@ static struct xt_match connbytes_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "connbytes",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= connbytes_mt_check,
 		.match		= connbytes_mt,
 		.destroy	= connbytes_mt_destroy,
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 1655e2cf25c..d2453d182d6 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -84,7 +84,7 @@ same_source_net(const union nf_inet_addr *addr,
 		const union nf_inet_addr *mask,
 		const union nf_inet_addr *u3, u_int8_t family)
 {
-	if (family == AF_INET) {
+	if (family == NFPROTO_IPV4) {
 		return (addr->ip & mask->ip) == (u3->ip & mask->ip);
 	} else {
 		union nf_inet_addr lh, rh;
@@ -114,7 +114,7 @@ static int count_them(struct xt_connlimit_data *data,
 	int matches = 0;
 
 
-	if (match->family == AF_INET6)
+	if (match->family == NFPROTO_IPV6)
 		hash = &data->iphash[connlimit_iphash6(addr, mask)];
 	else
 		hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)];
@@ -198,7 +198,7 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in,
 				    match->family, &tuple))
 		goto hotdrop;
 
-	if (match->family == AF_INET6) {
+	if (match->family == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
 		memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
 	} else {
@@ -276,7 +276,7 @@ connlimit_mt_destroy(const struct xt_match *match, void *matchinfo)
 static struct xt_match connlimit_mt_reg[] __read_mostly = {
 	{
 		.name       = "connlimit",
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.checkentry = connlimit_mt_check,
 		.match      = connlimit_mt,
 		.matchsize  = sizeof(struct xt_connlimit_info),
@@ -285,7 +285,7 @@ static struct xt_match connlimit_mt_reg[] __read_mostly = {
 	},
 	{
 		.name       = "connlimit",
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.checkentry = connlimit_mt_check,
 		.match      = connlimit_mt,
 		.matchsize  = sizeof(struct xt_connlimit_info),
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index aaa1b96691f..0577b8ff4e1 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -140,7 +140,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = {
 	{
 		.name		= "connmark",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= connmark_mt_check_v0,
 		.match		= connmark_mt_v0,
 		.destroy	= connmark_mt_destroy,
@@ -155,7 +155,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = {
 	{
 		.name		= "connmark",
 		.revision	= 0,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= connmark_mt_check_v0,
 		.match		= connmark_mt_v0,
 		.destroy	= connmark_mt_destroy,
@@ -170,7 +170,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = {
 	{
 		.name           = "connmark",
 		.revision       = 1,
-		.family         = AF_INET,
+		.family         = NFPROTO_IPV4,
 		.checkentry     = connmark_mt_check,
 		.match          = connmark_mt,
 		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
@@ -180,7 +180,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = {
 	{
 		.name           = "connmark",
 		.revision       = 1,
-		.family         = AF_INET6,
+		.family         = NFPROTO_IPV6,
 		.checkentry     = connmark_mt_check,
 		.match          = connmark_mt,
 		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 28a42a3fbff..392b457f9c2 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -121,9 +121,9 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr,
                   const union nf_inet_addr *uaddr,
                   const union nf_inet_addr *umask, unsigned int l3proto)
 {
-	if (l3proto == AF_INET)
+	if (l3proto == NFPROTO_IPV4)
 		return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
-	else if (l3proto == AF_INET6)
+	else if (l3proto == NFPROTO_IPV6)
 		return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
 		       &uaddr->in6) == 0;
 	else
@@ -356,7 +356,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
 		.revision   = 0,
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.match      = conntrack_mt_v0,
 		.checkentry = conntrack_mt_check,
 		.destroy    = conntrack_mt_destroy,
@@ -371,7 +371,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
 		.revision   = 1,
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
 		.match      = conntrack_mt,
 		.checkentry = conntrack_mt_check,
@@ -381,7 +381,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
 		.revision   = 1,
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
 		.match      = conntrack_mt,
 		.checkentry = conntrack_mt_check,
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index 8b6522186d9..87971f47132 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -138,7 +138,7 @@ dccp_mt_check(const char *tablename, const void *inf,
 static struct xt_match dccp_mt_reg[] __read_mostly = {
 	{
 		.name 		= "dccp",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= dccp_mt_check,
 		.match		= dccp_mt,
 		.matchsize	= sizeof(struct xt_dccp_info),
@@ -147,7 +147,7 @@ static struct xt_match dccp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name 		= "dccp",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= dccp_mt_check,
 		.match		= dccp_mt,
 		.matchsize	= sizeof(struct xt_dccp_info),
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 26f4aab9c42..7f03aa13a95 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -80,7 +80,7 @@ static bool tos_mt(const struct sk_buff *skb, const struct net_device *in,
 {
 	const struct xt_tos_match_info *info = matchinfo;
 
-	if (match->family == AF_INET)
+	if (match->family == NFPROTO_IPV4)
 		return ((ip_hdr(skb)->tos & info->tos_mask) ==
 		       info->tos_value) ^ !!info->invert;
 	else
@@ -91,7 +91,7 @@ static bool tos_mt(const struct sk_buff *skb, const struct net_device *in,
 static struct xt_match dscp_mt_reg[] __read_mostly = {
 	{
 		.name		= "dscp",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= dscp_mt_check,
 		.match		= dscp_mt,
 		.matchsize	= sizeof(struct xt_dscp_info),
@@ -99,7 +99,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "dscp",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= dscp_mt_check,
 		.match		= dscp_mt6,
 		.matchsize	= sizeof(struct xt_dscp_info),
@@ -108,7 +108,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
 	{
 		.name		= "tos",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= tos_mt_v0,
 		.matchsize	= sizeof(struct ipt_tos_info),
 		.me		= THIS_MODULE,
@@ -116,7 +116,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
 	{
 		.name		= "tos",
 		.revision	= 1,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= tos_mt,
 		.matchsize	= sizeof(struct xt_tos_match_info),
 		.me		= THIS_MODULE,
@@ -124,7 +124,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
 	{
 		.name		= "tos",
 		.revision	= 1,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= tos_mt,
 		.matchsize	= sizeof(struct xt_tos_match_info),
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index a133eb9b23e..045c4deecaf 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -88,7 +88,7 @@ esp_mt_check(const char *tablename, const void *ip_void,
 static struct xt_match esp_mt_reg[] __read_mostly = {
 	{
 		.name		= "esp",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= esp_mt_check,
 		.match		= esp_mt,
 		.matchsize	= sizeof(struct xt_esp),
@@ -97,7 +97,7 @@ static struct xt_match esp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "esp",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= esp_mt_check,
 		.match		= esp_mt,
 		.matchsize	= sizeof(struct xt_esp),
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 0c9268fd2e1..7bae369603d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -218,7 +218,7 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family)
 	hinfo->cfg.gc_interval = minfo->cfg.gc_interval;
 	hinfo->cfg.expire      = minfo->cfg.expire;
 
-	if (family == AF_INET)
+	if (family == NFPROTO_IPV4)
 		hinfo->cfg.srcmask = hinfo->cfg.dstmask = 32;
 	else
 		hinfo->cfg.srcmask = hinfo->cfg.dstmask = 128;
@@ -237,11 +237,10 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family)
 	hinfo->family = family;
 	hinfo->rnd_initialized = 0;
 	spin_lock_init(&hinfo->lock);
-	hinfo->pde =
-		proc_create_data(minfo->name, 0,
-				 family == AF_INET ? hashlimit_procdir4 :
-						     hashlimit_procdir6,
-				 &dl_file_ops, hinfo);
+	hinfo->pde = proc_create_data(minfo->name, 0,
+		(family == NFPROTO_IPV4) ?
+		hashlimit_procdir4 : hashlimit_procdir6,
+		&dl_file_ops, hinfo);
 	if (!hinfo->pde) {
 		vfree(hinfo);
 		return -1;
@@ -300,11 +299,10 @@ static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family)
 	hinfo->rnd_initialized = 0;
 	spin_lock_init(&hinfo->lock);
 
-	hinfo->pde =
-		proc_create_data(minfo->name, 0,
-				 family == AF_INET ? hashlimit_procdir4 :
-						     hashlimit_procdir6,
-				 &dl_file_ops, hinfo);
+	hinfo->pde = proc_create_data(minfo->name, 0,
+		(family == NFPROTO_IPV4) ?
+		hashlimit_procdir4 : hashlimit_procdir6,
+		&dl_file_ops, hinfo);
 	if (hinfo->pde == NULL) {
 		vfree(hinfo);
 		return -1;
@@ -370,7 +368,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo)
 
 	/* remove proc entry */
 	remove_proc_entry(hinfo->pde->name,
-			  hinfo->family == AF_INET ? hashlimit_procdir4 :
+			  hinfo->family == NFPROTO_IPV4 ? hashlimit_procdir4 :
 						     hashlimit_procdir6);
 	htable_selective_cleanup(hinfo, select_all);
 	vfree(hinfo);
@@ -501,7 +499,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 	memset(dst, 0, sizeof(*dst));
 
 	switch (hinfo->family) {
-	case AF_INET:
+	case NFPROTO_IPV4:
 		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP)
 			dst->ip.dst = maskl(ip_hdr(skb)->daddr,
 			              hinfo->cfg.dstmask);
@@ -515,7 +513,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 		nexthdr = ip_hdr(skb)->protocol;
 		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-	case AF_INET6:
+	case NFPROTO_IPV6:
 		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) {
 			memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr,
 			       sizeof(dst->ip6.dst));
@@ -737,7 +735,7 @@ hashlimit_mt_check(const char *tablename, const void *inf,
 		return false;
 	if (info->name[sizeof(info->name)-1] != '\0')
 		return false;
-	if (match->family == AF_INET) {
+	if (match->family == NFPROTO_IPV4) {
 		if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
 			return false;
 	} else {
@@ -805,7 +803,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 	{
 		.name		= "hashlimit",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= hashlimit_mt_v0,
 		.matchsize	= sizeof(struct xt_hashlimit_info),
 #ifdef CONFIG_COMPAT
@@ -820,7 +818,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 	{
 		.name           = "hashlimit",
 		.revision       = 1,
-		.family         = AF_INET,
+		.family         = NFPROTO_IPV4,
 		.match          = hashlimit_mt,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
 		.checkentry     = hashlimit_mt_check,
@@ -830,7 +828,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	{
 		.name		= "hashlimit",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= hashlimit_mt_v0,
 		.matchsize	= sizeof(struct xt_hashlimit_info),
 #ifdef CONFIG_COMPAT
@@ -845,7 +843,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 	{
 		.name           = "hashlimit",
 		.revision       = 1,
-		.family         = AF_INET6,
+		.family         = NFPROTO_IPV6,
 		.match          = hashlimit_mt,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
 		.checkentry     = hashlimit_mt_check,
@@ -907,7 +905,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 	rateinfo_recalc(ent, jiffies);
 
 	switch (family) {
-	case AF_INET:
+	case NFPROTO_IPV4:
 		return seq_printf(s, "%ld %u.%u.%u.%u:%u->"
 				     "%u.%u.%u.%u:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
@@ -918,7 +916,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-	case AF_INET6:
+	case NFPROTO_IPV6:
 		return seq_printf(s, "%ld " NIP6_FMT ":%u->"
 				     NIP6_FMT ":%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index dada2905d66..134d94324eb 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -81,7 +81,7 @@ static void helper_mt_destroy(const struct xt_match *match, void *matchinfo)
 static struct xt_match helper_mt_reg[] __read_mostly = {
 	{
 		.name		= "helper",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= helper_mt_check,
 		.match		= helper_mt,
 		.destroy	= helper_mt_destroy,
@@ -90,7 +90,7 @@ static struct xt_match helper_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "helper",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= helper_mt_check,
 		.match		= helper_mt,
 		.destroy	= helper_mt_destroy,
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index c63e9333c75..a7498cc48dc 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -141,7 +141,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = {
 	{
 		.name      = "iprange",
 		.revision  = 0,
-		.family    = AF_INET,
+		.family    = NFPROTO_IPV4,
 		.match     = iprange_mt_v0,
 		.matchsize = sizeof(struct ipt_iprange_info),
 		.me        = THIS_MODULE,
@@ -149,7 +149,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = {
 	{
 		.name      = "iprange",
 		.revision  = 1,
-		.family    = AF_INET,
+		.family    = NFPROTO_IPV4,
 		.match     = iprange_mt4,
 		.matchsize = sizeof(struct xt_iprange_mtinfo),
 		.me        = THIS_MODULE,
@@ -157,7 +157,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = {
 	{
 		.name      = "iprange",
 		.revision  = 1,
-		.family    = AF_INET6,
+		.family    = NFPROTO_IPV6,
 		.match     = iprange_mt6,
 		.matchsize = sizeof(struct xt_iprange_mtinfo),
 		.me        = THIS_MODULE,
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index b8640f97295..b8612d1914b 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -48,14 +48,14 @@ length_mt6(const struct sk_buff *skb, const struct net_device *in,
 static struct xt_match length_mt_reg[] __read_mostly = {
 	{
 		.name		= "length",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= length_mt,
 		.matchsize	= sizeof(struct xt_length_info),
 		.me		= THIS_MODULE,
 	},
 	{
 		.name		= "length",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= length_mt6,
 		.matchsize	= sizeof(struct xt_length_info),
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index aad9ab8d204..584d66893c4 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -170,7 +170,7 @@ static int limit_mt_compat_to_user(void __user *dst, void *src)
 static struct xt_match limit_mt_reg[] __read_mostly = {
 	{
 		.name		= "limit",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= limit_mt_check,
 		.match		= limit_mt,
 		.matchsize	= sizeof(struct xt_rateinfo),
@@ -183,7 +183,7 @@ static struct xt_match limit_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "limit",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= limit_mt_check,
 		.match		= limit_mt,
 		.matchsize	= sizeof(struct xt_rateinfo),
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index b3e96a0ec17..60db240098a 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -42,7 +42,7 @@ mac_mt(const struct sk_buff *skb, const struct net_device *in,
 static struct xt_match mac_mt_reg[] __read_mostly = {
 	{
 		.name		= "mac",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= mac_mt,
 		.matchsize	= sizeof(struct xt_mac_info),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
@@ -52,7 +52,7 @@ static struct xt_match mac_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "mac",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= mac_mt,
 		.matchsize	= sizeof(struct xt_mac_info),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 9f78f6120fb..c66affd5722 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -92,7 +92,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = {
 	{
 		.name		= "mark",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= mark_mt_check_v0,
 		.match		= mark_mt_v0,
 		.matchsize	= sizeof(struct xt_mark_info),
@@ -106,7 +106,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = {
 	{
 		.name		= "mark",
 		.revision	= 0,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= mark_mt_check_v0,
 		.match		= mark_mt_v0,
 		.matchsize	= sizeof(struct xt_mark_info),
@@ -120,7 +120,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = {
 	{
 		.name           = "mark",
 		.revision       = 1,
-		.family         = AF_INET,
+		.family         = NFPROTO_IPV4,
 		.match          = mark_mt,
 		.matchsize      = sizeof(struct xt_mark_mtinfo1),
 		.me             = THIS_MODULE,
@@ -128,7 +128,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = {
 	{
 		.name           = "mark",
 		.revision       = 1,
-		.family         = AF_INET6,
+		.family         = NFPROTO_IPV6,
 		.match          = mark_mt,
 		.matchsize      = sizeof(struct xt_mark_mtinfo1),
 		.me             = THIS_MODULE,
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index fd88c489b70..f6fe008ab8c 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -216,7 +216,7 @@ multiport_mt6_check(const char *tablename, const void *info,
 static struct xt_match multiport_mt_reg[] __read_mostly = {
 	{
 		.name		= "multiport",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.revision	= 0,
 		.checkentry	= multiport_mt_check_v0,
 		.match		= multiport_mt_v0,
@@ -225,7 +225,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "multiport",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.revision	= 1,
 		.checkentry	= multiport_mt_check,
 		.match		= multiport_mt,
@@ -234,7 +234,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "multiport",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.revision	= 0,
 		.checkentry	= multiport_mt6_check_v0,
 		.match		= multiport_mt_v0,
@@ -243,7 +243,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "multiport",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.revision	= 1,
 		.checkentry	= multiport_mt6_check,
 		.match		= multiport_mt,
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 9059c16144c..d1c3b7ae9b4 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -153,7 +153,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = {
 	{
 		.name       = "owner",
 		.revision   = 0,
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.match      = owner_mt_v0,
 		.matchsize  = sizeof(struct ipt_owner_info),
 		.checkentry = owner_mt_check_v0,
@@ -164,7 +164,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = {
 	{
 		.name       = "owner",
 		.revision   = 0,
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.match      = owner_mt6_v0,
 		.matchsize  = sizeof(struct ip6t_owner_info),
 		.checkentry = owner_mt6_check_v0,
@@ -175,7 +175,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = {
 	{
 		.name       = "owner",
 		.revision   = 1,
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.match      = owner_mt,
 		.matchsize  = sizeof(struct xt_owner_match_info),
 		.hooks      = (1 << NF_INET_LOCAL_OUT) |
@@ -185,7 +185,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = {
 	{
 		.name       = "owner",
 		.revision   = 1,
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.match      = owner_mt,
 		.matchsize  = sizeof(struct xt_owner_match_info),
 		.hooks      = (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 4ec1094bda9..72a0bdd53fa 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -121,7 +121,7 @@ physdev_mt_check(const char *tablename, const void *ip,
 static struct xt_match physdev_mt_reg[] __read_mostly = {
 	{
 		.name		= "physdev",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= physdev_mt_check,
 		.match		= physdev_mt,
 		.matchsize	= sizeof(struct xt_physdev_info),
@@ -129,7 +129,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "physdev",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= physdev_mt_check,
 		.match		= physdev_mt,
 		.matchsize	= sizeof(struct xt_physdev_info),
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 7936f7e2325..81e86d319a8 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -33,10 +33,10 @@ pkttype_mt(const struct sk_buff *skb, const struct net_device *in,
 
 	if (skb->pkt_type != PACKET_LOOPBACK)
 		type = skb->pkt_type;
-	else if (match->family == AF_INET &&
+	else if (match->family == NFPROTO_IPV4 &&
 	    ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (match->family == AF_INET6 &&
+	else if (match->family == NFPROTO_IPV6 &&
 	    ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
 		type = PACKET_MULTICAST;
 	else
@@ -48,14 +48,14 @@ pkttype_mt(const struct sk_buff *skb, const struct net_device *in,
 static struct xt_match pkttype_mt_reg[] __read_mostly = {
 	{
 		.name		= "pkttype",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= pkttype_mt,
 		.matchsize	= sizeof(struct xt_pkttype_info),
 		.me		= THIS_MODULE,
 	},
 	{
 		.name		= "pkttype",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= pkttype_mt,
 		.matchsize	= sizeof(struct xt_pkttype_info),
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index d351582b2a3..f1d514e9d0a 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -26,9 +26,9 @@ xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m,
 	    const union nf_inet_addr *a2, unsigned short family)
 {
 	switch (family) {
-	case AF_INET:
+	case NFPROTO_IPV4:
 		return ((a1->ip ^ a2->ip) & m->ip) == 0;
-	case AF_INET6:
+	case NFPROTO_IPV6:
 		return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0;
 	}
 	return false;
@@ -165,7 +165,7 @@ policy_mt_check(const char *tablename, const void *ip_void,
 static struct xt_match policy_mt_reg[] __read_mostly = {
 	{
 		.name		= "policy",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry 	= policy_mt_check,
 		.match		= policy_mt,
 		.matchsize	= sizeof(struct xt_policy_info),
@@ -173,7 +173,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "policy",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= policy_mt_check,
 		.match		= policy_mt,
 		.matchsize	= sizeof(struct xt_policy_info),
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 3b021d0c522..59f61e32b62 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -57,7 +57,7 @@ quota_mt_check(const char *tablename, const void *entry,
 static struct xt_match quota_mt_reg[] __read_mostly = {
 	{
 		.name		= "quota",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= quota_mt_check,
 		.match		= quota_mt,
 		.matchsize	= sizeof(struct xt_quota_info),
@@ -65,7 +65,7 @@ static struct xt_match quota_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "quota",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= quota_mt_check,
 		.match		= quota_mt,
 		.matchsize	= sizeof(struct xt_quota_info),
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index ebd84f1b4f6..ba1cb5760f4 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -139,7 +139,7 @@ static void xt_rateest_mt_destroy(const struct xt_match *match,
 
 static struct xt_match xt_rateest_match[] __read_mostly = {
 	{
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.name		= "rateest",
 		.match		= xt_rateest_mt,
 		.checkentry	= xt_rateest_mt_checkentry,
@@ -148,7 +148,7 @@ static struct xt_match xt_rateest_match[] __read_mostly = {
 		.me		= THIS_MODULE,
 	},
 	{
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.name		= "rateest",
 		.match		= xt_rateest_mt,
 		.checkentry	= xt_rateest_mt_checkentry,
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 7df1627c536..ef65756d489 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -39,7 +39,7 @@ static struct xt_match realm_mt_reg __read_mostly = {
 	.matchsize	= sizeof(struct xt_realm_info),
 	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.me		= THIS_MODULE
 };
 
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index adc2e2f1b09..4a916e2624d 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -124,7 +124,7 @@ recent_entry_lookup(const struct recent_table *table,
 	struct recent_entry *e;
 	unsigned int h;
 
-	if (family == AF_INET)
+	if (family == NFPROTO_IPV4)
 		h = recent_entry_hash4(addrp);
 	else
 		h = recent_entry_hash6(addrp);
@@ -165,7 +165,7 @@ recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr,
 	e->nstamps   = 1;
 	e->index     = 1;
 	e->family    = family;
-	if (family == AF_INET)
+	if (family == NFPROTO_IPV4)
 		list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]);
 	else
 		list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]);
@@ -216,7 +216,7 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in,
 	u_int8_t ttl;
 	bool ret = info->invert;
 
-	if (match->family == AF_INET) {
+	if (match->family == NFPROTO_IPV4) {
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (info->side == XT_RECENT_DEST)
@@ -429,7 +429,7 @@ static int recent_seq_show(struct seq_file *seq, void *v)
 	unsigned int i;
 
 	i = (e->index - 1) % ip_pkt_list_tot;
-	if (e->family == AF_INET)
+	if (e->family == NFPROTO_IPV4)
 		seq_printf(seq, "src=" NIPQUAD_FMT " ttl: %u last_seen: %lu "
 			   "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl,
 			   e->stamps[i], e->index);
@@ -519,10 +519,11 @@ static ssize_t recent_old_proc_write(struct file *file,
 	addr = in_aton(c);
 
 	spin_lock_bh(&recent_lock);
-	e = recent_entry_lookup(t, (const void *)&addr, PF_INET, 0);
+	e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0);
 	if (e == NULL) {
 		if (add)
-			recent_entry_init(t, (const void *)&addr, PF_INET, 0);
+			recent_entry_init(t, (const void *)&addr,
+					  NFPROTO_IPV4, 0);
 	} else {
 		if (add)
 			recent_entry_update(t, e);
@@ -585,10 +586,10 @@ recent_mt_proc_write(struct file *file, const char __user *input,
 	++c;
 	--size;
 	if (strnchr(c, size, ':') != NULL) {
-		family = AF_INET6;
+		family = NFPROTO_IPV6;
 		succ   = in6_pton(c, size, (void *)&addr, '\n', NULL);
 	} else {
-		family = AF_INET;
+		family = NFPROTO_IPV4;
 		succ   = in4_pton(c, size, (void *)&addr, '\n', NULL);
 	}
 
@@ -628,7 +629,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
 	{
 		.name       = "recent",
 		.revision   = 0,
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.match      = recent_mt,
 		.matchsize  = sizeof(struct xt_recent_mtinfo),
 		.checkentry = recent_mt_check,
@@ -638,7 +639,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
 	{
 		.name       = "recent",
 		.revision   = 0,
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.match      = recent_mt,
 		.matchsize  = sizeof(struct xt_recent_mtinfo),
 		.checkentry = recent_mt_check,
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index e6e4681fa04..ab67aca4d8f 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -169,7 +169,7 @@ sctp_mt_check(const char *tablename, const void *inf,
 static struct xt_match sctp_mt_reg[] __read_mostly = {
 	{
 		.name		= "sctp",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= sctp_mt_check,
 		.match		= sctp_mt,
 		.matchsize	= sizeof(struct xt_sctp_info),
@@ -178,7 +178,7 @@ static struct xt_match sctp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "sctp",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= sctp_mt_check,
 		.match		= sctp_mt,
 		.matchsize	= sizeof(struct xt_sctp_info),
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index a776dc36a19..f92f8bcc1e3 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -61,7 +61,7 @@ static void state_mt_destroy(const struct xt_match *match, void *matchinfo)
 static struct xt_match state_mt_reg[] __read_mostly = {
 	{
 		.name		= "state",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= state_mt_check,
 		.match		= state_mt,
 		.destroy	= state_mt_destroy,
@@ -70,7 +70,7 @@ static struct xt_match state_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "state",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= state_mt_check,
 		.match		= state_mt,
 		.destroy	= state_mt_destroy,
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 43133080da7..fd3bb1400df 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -69,7 +69,7 @@ statistic_mt_check(const char *tablename, const void *entry,
 static struct xt_match statistic_mt_reg[] __read_mostly = {
 	{
 		.name		= "statistic",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= statistic_mt_check,
 		.match		= statistic_mt,
 		.matchsize	= sizeof(struct xt_statistic_info),
@@ -77,7 +77,7 @@ static struct xt_match statistic_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "statistic",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= statistic_mt_check,
 		.match		= statistic_mt,
 		.matchsize	= sizeof(struct xt_statistic_info),
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 4903182a062..50169718377 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -85,7 +85,7 @@ static struct xt_match string_mt_reg[] __read_mostly = {
 	{
 		.name 		= "string",
 		.revision	= 0,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= string_mt_check,
 		.match 		= string_mt,
 		.destroy 	= string_mt_destroy,
@@ -95,7 +95,7 @@ static struct xt_match string_mt_reg[] __read_mostly = {
 	{
 		.name 		= "string",
 		.revision	= 1,
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= string_mt_check,
 		.match 		= string_mt,
 		.destroy 	= string_mt_destroy,
@@ -105,7 +105,7 @@ static struct xt_match string_mt_reg[] __read_mostly = {
 	{
 		.name 		= "string",
 		.revision	= 0,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= string_mt_check,
 		.match 		= string_mt,
 		.destroy 	= string_mt_destroy,
@@ -115,7 +115,7 @@ static struct xt_match string_mt_reg[] __read_mostly = {
 	{
 		.name 		= "string",
 		.revision	= 1,
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= string_mt_check,
 		.match 		= string_mt,
 		.destroy 	= string_mt_destroy,
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 6771bf01275..4791c7cbe5a 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -83,7 +83,7 @@ dropit:
 static struct xt_match tcpmss_mt_reg[] __read_mostly = {
 	{
 		.name		= "tcpmss",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.match		= tcpmss_mt,
 		.matchsize	= sizeof(struct xt_tcpmss_match_info),
 		.proto		= IPPROTO_TCP,
@@ -91,7 +91,7 @@ static struct xt_match tcpmss_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "tcpmss",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.match		= tcpmss_mt,
 		.matchsize	= sizeof(struct xt_tcpmss_match_info),
 		.proto		= IPPROTO_TCP,
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 951b06b8d70..5a6268cbb9f 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -186,7 +186,7 @@ udp_mt_check(const char *tablename, const void *info,
 static struct xt_match tcpudp_mt_reg[] __read_mostly = {
 	{
 		.name		= "tcp",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= tcp_mt_check,
 		.match		= tcp_mt,
 		.matchsize	= sizeof(struct xt_tcp),
@@ -195,7 +195,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "tcp",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= tcp_mt_check,
 		.match		= tcp_mt,
 		.matchsize	= sizeof(struct xt_tcp),
@@ -204,7 +204,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "udp",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= udp_mt_check,
 		.match		= udp_mt,
 		.matchsize	= sizeof(struct xt_udp),
@@ -213,7 +213,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "udp",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= udp_mt_check,
 		.match		= udp_mt,
 		.matchsize	= sizeof(struct xt_udp),
@@ -222,7 +222,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "udplite",
-		.family		= AF_INET,
+		.family		= NFPROTO_IPV4,
 		.checkentry	= udp_mt_check,
 		.match		= udp_mt,
 		.matchsize	= sizeof(struct xt_udp),
@@ -231,7 +231,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "udplite",
-		.family		= AF_INET6,
+		.family		= NFPROTO_IPV6,
 		.checkentry	= udp_mt_check,
 		.match		= udp_mt,
 		.matchsize	= sizeof(struct xt_udp),
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 307a2c3c2df..fe9dae2b4f5 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -240,7 +240,7 @@ time_mt_check(const char *tablename, const void *ip,
 static struct xt_match time_mt_reg[] __read_mostly = {
 	{
 		.name       = "time",
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.match      = time_mt,
 		.matchsize  = sizeof(struct xt_time_info),
 		.checkentry = time_mt_check,
@@ -248,7 +248,7 @@ static struct xt_match time_mt_reg[] __read_mostly = {
 	},
 	{
 		.name       = "time",
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.match      = time_mt,
 		.matchsize  = sizeof(struct xt_time_info),
 		.checkentry = time_mt_check,
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index 627e0f336d5..ed9f8340611 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -102,14 +102,14 @@ u32_mt(const struct sk_buff *skb, const struct net_device *in,
 static struct xt_match u32_mt_reg[] __read_mostly = {
 	{
 		.name       = "u32",
-		.family     = AF_INET,
+		.family     = NFPROTO_IPV4,
 		.match      = u32_mt,
 		.matchsize  = sizeof(struct xt_u32),
 		.me         = THIS_MODULE,
 	},
 	{
 		.name       = "u32",
-		.family     = AF_INET6,
+		.family     = NFPROTO_IPV6,
 		.match      = u32_mt,
 		.matchsize  = sizeof(struct xt_u32),
 		.me         = THIS_MODULE,
-- 
cgit v1.2.3


From 55b69e91040c685a064198bd76e59885b7ad26c6 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Wed, 8 Oct 2008 11:35:01 +0200
Subject: netfilter: implement NFPROTO_UNSPEC as a wildcard for extensions

When a match or target is looked up using xt_find_{match,target},
Xtables will also search the NFPROTO_UNSPEC module list. This allows
for protocol-independent extensions (like xt_time) to be reused from
other components (e.g. arptables, ebtables).

Extensions that take different codepaths depending on match->family
or target->family of course cannot use NFPROTO_UNSPEC within the
registration structure (e.g. xt_pkttype).

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/x_tables.c     | 10 ++++++++++
 net/netfilter/xt_CLASSIFY.c  | 38 ++++++++++++--------------------------
 net/netfilter/xt_MARK.c      | 10 +---------
 net/netfilter/xt_RATEEST.c   | 33 +++++++++++----------------------
 net/netfilter/xt_SECMARK.c   | 32 +++++++++++---------------------
 net/netfilter/xt_TRACE.c     | 26 +++++++++-----------------
 net/netfilter/xt_limit.c     | 40 +++++++++++++---------------------------
 net/netfilter/xt_mark.c      | 26 ++------------------------
 net/netfilter/xt_quota.c     | 29 ++++++++++-------------------
 net/netfilter/xt_rateest.c   | 33 +++++++++++----------------------
 net/netfilter/xt_statistic.c | 31 ++++++++++---------------------
 net/netfilter/xt_string.c    | 31 ++++++-------------------------
 net/netfilter/xt_time.c      | 28 +++++++++-------------------
 net/netfilter/xt_u32.c       | 26 +++++++++-----------------
 14 files changed, 124 insertions(+), 269 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2a7eb1da5d0..aece6c2d134 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -209,6 +209,11 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
 		}
 	}
 	mutex_unlock(&xt[af].mutex);
+
+	if (af != NFPROTO_UNSPEC)
+		/* Try searching again in the family-independent list */
+		return xt_find_match(NFPROTO_UNSPEC, name, revision);
+
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL(xt_find_match);
@@ -234,6 +239,11 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
 		}
 	}
 	mutex_unlock(&xt[af].mutex);
+
+	if (af != NFPROTO_UNSPEC)
+		/* Try searching again in the family-independent list */
+		return xt_find_target(NFPROTO_UNSPEC, name, revision);
+
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL(xt_find_target);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 9d68da1748b..8cffa295dd3 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -37,40 +37,26 @@ classify_tg(struct sk_buff *skb, const struct net_device *in,
 	return XT_CONTINUE;
 }
 
-static struct xt_target classify_tg_reg[] __read_mostly = {
-	{
-		.family		= NFPROTO_IPV4,
-		.name 		= "CLASSIFY",
-		.target 	= classify_tg,
-		.targetsize	= sizeof(struct xt_classify_target_info),
-		.table		= "mangle",
-		.hooks		= (1 << NF_INET_LOCAL_OUT) |
-				  (1 << NF_INET_FORWARD) |
-				  (1 << NF_INET_POST_ROUTING),
-		.me 		= THIS_MODULE,
-	},
-	{
-		.name 		= "CLASSIFY",
-		.family		= NFPROTO_IPV6,
-		.target 	= classify_tg,
-		.targetsize	= sizeof(struct xt_classify_target_info),
-		.table		= "mangle",
-		.hooks		= (1 << NF_INET_LOCAL_OUT) |
-				  (1 << NF_INET_FORWARD) |
-				  (1 << NF_INET_POST_ROUTING),
-		.me 		= THIS_MODULE,
-	},
+static struct xt_target classify_tg_reg __read_mostly = {
+	.name       = "CLASSIFY",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.table      = "mangle",
+	.hooks      = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+		      (1 << NF_INET_POST_ROUTING),
+	.target     = classify_tg,
+	.targetsize = sizeof(struct xt_classify_target_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init classify_tg_init(void)
 {
-	return xt_register_targets(classify_tg_reg,
-	       ARRAY_SIZE(classify_tg_reg));
+	return xt_register_target(&classify_tg_reg);
 }
 
 static void __exit classify_tg_exit(void)
 {
-	xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
+	xt_unregister_target(&classify_tg_reg);
 }
 
 module_init(classify_tg_init);
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 55ef0796c76..c8ea7a80970 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -222,15 +222,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	{
 		.name           = "MARK",
 		.revision       = 2,
-		.family         = NFPROTO_IPV4,
-		.target         = mark_tg,
-		.targetsize     = sizeof(struct xt_mark_tginfo2),
-		.me             = THIS_MODULE,
-	},
-	{
-		.name           = "MARK",
-		.revision       = 2,
-		.family         = NFPROTO_IPV6,
+		.family         = NFPROTO_UNSPEC,
 		.target         = mark_tg,
 		.targetsize     = sizeof(struct xt_mark_tginfo2),
 		.me             = THIS_MODULE,
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index f7114fc5cfc..da7946e6ecb 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -157,25 +157,15 @@ static void xt_rateest_tg_destroy(const struct xt_target *target,
 	xt_rateest_put(info->est);
 }
 
-static struct xt_target xt_rateest_target[] __read_mostly = {
-	{
-		.family		= NFPROTO_IPV4,
-		.name		= "RATEEST",
-		.target		= xt_rateest_tg,
-		.checkentry	= xt_rateest_tg_checkentry,
-		.destroy	= xt_rateest_tg_destroy,
-		.targetsize	= sizeof(struct xt_rateest_target_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.family		= NFPROTO_IPV6,
-		.name		= "RATEEST",
-		.target		= xt_rateest_tg,
-		.checkentry	= xt_rateest_tg_checkentry,
-		.destroy	= xt_rateest_tg_destroy,
-		.targetsize	= sizeof(struct xt_rateest_target_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_target xt_rateest_tg_reg __read_mostly = {
+	.name       = "RATEEST",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.target     = xt_rateest_tg,
+	.checkentry = xt_rateest_tg_checkentry,
+	.destroy    = xt_rateest_tg_destroy,
+	.targetsize = sizeof(struct xt_rateest_target_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init xt_rateest_tg_init(void)
@@ -186,13 +176,12 @@ static int __init xt_rateest_tg_init(void)
 		INIT_HLIST_HEAD(&rateest_hash[i]);
 
 	get_random_bytes(&jhash_rnd, sizeof(jhash_rnd));
-	return xt_register_targets(xt_rateest_target,
-				   ARRAY_SIZE(xt_rateest_target));
+	return xt_register_target(&xt_rateest_tg_reg);
 }
 
 static void __exit xt_rateest_tg_fini(void)
 {
-	xt_unregister_targets(xt_rateest_target, ARRAY_SIZE(xt_rateest_target));
+	xt_unregister_target(&xt_rateest_tg_reg);
 }
 
 
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 8f8f57b93a6..2a2ab833481 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -125,35 +125,25 @@ static void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
 	}
 }
 
-static struct xt_target secmark_tg_reg[] __read_mostly = {
-	{
-		.name		= "SECMARK",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= secmark_tg_check,
-		.destroy	= secmark_tg_destroy,
-		.target		= secmark_tg,
-		.targetsize	= sizeof(struct xt_secmark_target_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "SECMARK",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= secmark_tg_check,
-		.destroy	= secmark_tg_destroy,
-		.target		= secmark_tg,
-		.targetsize	= sizeof(struct xt_secmark_target_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_target secmark_tg_reg __read_mostly = {
+	.name       = "SECMARK",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = secmark_tg_check,
+	.destroy    = secmark_tg_destroy,
+	.target     = secmark_tg,
+	.targetsize = sizeof(struct xt_secmark_target_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init secmark_tg_init(void)
 {
-	return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
+	return xt_register_target(&secmark_tg_reg);
 }
 
 static void __exit secmark_tg_exit(void)
 {
-	xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
+	xt_unregister_target(&secmark_tg_reg);
 }
 
 module_init(secmark_tg_init);
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index e1bcad57914..da35f9f1cd7 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -19,31 +19,23 @@ trace_tg(struct sk_buff *skb, const struct net_device *in,
 	return XT_CONTINUE;
 }
 
-static struct xt_target trace_tg_reg[] __read_mostly = {
-	{
-		.name		= "TRACE",
-		.family		= NFPROTO_IPV4,
-		.target		= trace_tg,
-		.table		= "raw",
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "TRACE",
-		.family		= NFPROTO_IPV6,
-		.target		= trace_tg,
-		.table		= "raw",
-		.me		= THIS_MODULE,
-	},
+static struct xt_target trace_tg_reg __read_mostly = {
+	.name       = "TRACE",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.table      = "raw",
+	.target     = trace_tg,
+	.me         = THIS_MODULE,
 };
 
 static int __init trace_tg_init(void)
 {
-	return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
+	return xt_register_target(&trace_tg_reg);
 }
 
 static void __exit trace_tg_exit(void)
 {
-	xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
+	xt_unregister_target(&trace_tg_reg);
 }
 
 module_init(trace_tg_init);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 584d66893c4..00247bd1095 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -167,43 +167,29 @@ static int limit_mt_compat_to_user(void __user *dst, void *src)
 }
 #endif /* CONFIG_COMPAT */
 
-static struct xt_match limit_mt_reg[] __read_mostly = {
-	{
-		.name		= "limit",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= limit_mt_check,
-		.match		= limit_mt,
-		.matchsize	= sizeof(struct xt_rateinfo),
+static struct xt_match limit_mt_reg __read_mostly = {
+	.name             = "limit",
+	.revision         = 0,
+	.family           = NFPROTO_UNSPEC,
+	.match            = limit_mt,
+	.checkentry       = limit_mt_check,
+	.matchsize        = sizeof(struct xt_rateinfo),
 #ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_rateinfo),
-		.compat_from_user = limit_mt_compat_from_user,
-		.compat_to_user	= limit_mt_compat_to_user,
+	.compatsize       = sizeof(struct compat_xt_rateinfo),
+	.compat_from_user = limit_mt_compat_from_user,
+	.compat_to_user   = limit_mt_compat_to_user,
 #endif
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "limit",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= limit_mt_check,
-		.match		= limit_mt,
-		.matchsize	= sizeof(struct xt_rateinfo),
-#ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_rateinfo),
-		.compat_from_user = limit_mt_compat_from_user,
-		.compat_to_user	= limit_mt_compat_to_user,
-#endif
-		.me		= THIS_MODULE,
-	},
+	.me               = THIS_MODULE,
 };
 
 static int __init limit_mt_init(void)
 {
-	return xt_register_matches(limit_mt_reg, ARRAY_SIZE(limit_mt_reg));
+	return xt_register_match(&limit_mt_reg);
 }
 
 static void __exit limit_mt_exit(void)
 {
-	xt_unregister_matches(limit_mt_reg, ARRAY_SIZE(limit_mt_reg));
+	xt_unregister_match(&limit_mt_reg);
 }
 
 module_init(limit_mt_init);
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index c66affd5722..96dd2b63b6b 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -92,7 +92,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = {
 	{
 		.name		= "mark",
 		.revision	= 0,
-		.family		= NFPROTO_IPV4,
+		.family		= NFPROTO_UNSPEC,
 		.checkentry	= mark_mt_check_v0,
 		.match		= mark_mt_v0,
 		.matchsize	= sizeof(struct xt_mark_info),
@@ -103,32 +103,10 @@ static struct xt_match mark_mt_reg[] __read_mostly = {
 #endif
 		.me		= THIS_MODULE,
 	},
-	{
-		.name		= "mark",
-		.revision	= 0,
-		.family		= NFPROTO_IPV6,
-		.checkentry	= mark_mt_check_v0,
-		.match		= mark_mt_v0,
-		.matchsize	= sizeof(struct xt_mark_info),
-#ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_mark_info),
-		.compat_from_user = mark_mt_compat_from_user_v0,
-		.compat_to_user	= mark_mt_compat_to_user_v0,
-#endif
-		.me		= THIS_MODULE,
-	},
-	{
-		.name           = "mark",
-		.revision       = 1,
-		.family         = NFPROTO_IPV4,
-		.match          = mark_mt,
-		.matchsize      = sizeof(struct xt_mark_mtinfo1),
-		.me             = THIS_MODULE,
-	},
 	{
 		.name           = "mark",
 		.revision       = 1,
-		.family         = NFPROTO_IPV6,
+		.family         = NFPROTO_UNSPEC,
 		.match          = mark_mt,
 		.matchsize      = sizeof(struct xt_mark_mtinfo1),
 		.me             = THIS_MODULE,
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 59f61e32b62..a3c8798f0cc 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -54,33 +54,24 @@ quota_mt_check(const char *tablename, const void *entry,
 	return true;
 }
 
-static struct xt_match quota_mt_reg[] __read_mostly = {
-	{
-		.name		= "quota",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= quota_mt_check,
-		.match		= quota_mt,
-		.matchsize	= sizeof(struct xt_quota_info),
-		.me		= THIS_MODULE
-	},
-	{
-		.name		= "quota",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= quota_mt_check,
-		.match		= quota_mt,
-		.matchsize	= sizeof(struct xt_quota_info),
-		.me		= THIS_MODULE
-	},
+static struct xt_match quota_mt_reg __read_mostly = {
+	.name       = "quota",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = quota_mt,
+	.checkentry = quota_mt_check,
+	.matchsize  = sizeof(struct xt_quota_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init quota_mt_init(void)
 {
-	return xt_register_matches(quota_mt_reg, ARRAY_SIZE(quota_mt_reg));
+	return xt_register_match(&quota_mt_reg);
 }
 
 static void __exit quota_mt_exit(void)
 {
-	xt_unregister_matches(quota_mt_reg, ARRAY_SIZE(quota_mt_reg));
+	xt_unregister_match(&quota_mt_reg);
 }
 
 module_init(quota_mt_init);
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index ba1cb5760f4..4dcfd7353db 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -137,36 +137,25 @@ static void xt_rateest_mt_destroy(const struct xt_match *match,
 		xt_rateest_put(info->est2);
 }
 
-static struct xt_match xt_rateest_match[] __read_mostly = {
-	{
-		.family		= NFPROTO_IPV4,
-		.name		= "rateest",
-		.match		= xt_rateest_mt,
-		.checkentry	= xt_rateest_mt_checkentry,
-		.destroy	= xt_rateest_mt_destroy,
-		.matchsize	= sizeof(struct xt_rateest_match_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.family		= NFPROTO_IPV6,
-		.name		= "rateest",
-		.match		= xt_rateest_mt,
-		.checkentry	= xt_rateest_mt_checkentry,
-		.destroy	= xt_rateest_mt_destroy,
-		.matchsize	= sizeof(struct xt_rateest_match_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_match xt_rateest_mt_reg __read_mostly = {
+	.name       = "rateest",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = xt_rateest_mt,
+	.checkentry = xt_rateest_mt_checkentry,
+	.destroy    = xt_rateest_mt_destroy,
+	.matchsize  = sizeof(struct xt_rateest_match_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init xt_rateest_mt_init(void)
 {
-	return xt_register_matches(xt_rateest_match,
-				   ARRAY_SIZE(xt_rateest_match));
+	return xt_register_match(&xt_rateest_mt_reg);
 }
 
 static void __exit xt_rateest_mt_fini(void)
 {
-	xt_unregister_matches(xt_rateest_match, ARRAY_SIZE(xt_rateest_match));
+	xt_unregister_match(&xt_rateest_mt_reg);
 }
 
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index fd3bb1400df..f41a92322e6 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -66,35 +66,24 @@ statistic_mt_check(const char *tablename, const void *entry,
 	return true;
 }
 
-static struct xt_match statistic_mt_reg[] __read_mostly = {
-	{
-		.name		= "statistic",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= statistic_mt_check,
-		.match		= statistic_mt,
-		.matchsize	= sizeof(struct xt_statistic_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "statistic",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= statistic_mt_check,
-		.match		= statistic_mt,
-		.matchsize	= sizeof(struct xt_statistic_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_match xt_statistic_mt_reg __read_mostly = {
+	.name       = "statistic",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = statistic_mt,
+	.checkentry = statistic_mt_check,
+	.matchsize  = sizeof(struct xt_statistic_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init statistic_mt_init(void)
 {
-	return xt_register_matches(statistic_mt_reg,
-	       ARRAY_SIZE(statistic_mt_reg));
+	return xt_register_match(&xt_statistic_mt_reg);
 }
 
 static void __exit statistic_mt_exit(void)
 {
-	xt_unregister_matches(statistic_mt_reg,
-	                      ARRAY_SIZE(statistic_mt_reg));
+	xt_unregister_match(&xt_statistic_mt_reg);
 }
 
 module_init(statistic_mt_init);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 50169718377..18d8884e737 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -81,11 +81,11 @@ static void string_mt_destroy(const struct xt_match *match, void *matchinfo)
 	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
 }
 
-static struct xt_match string_mt_reg[] __read_mostly = {
+static struct xt_match xt_string_mt_reg[] __read_mostly = {
 	{
 		.name 		= "string",
 		.revision	= 0,
-		.family		= NFPROTO_IPV4,
+		.family		= NFPROTO_UNSPEC,
 		.checkentry	= string_mt_check,
 		.match 		= string_mt,
 		.destroy 	= string_mt_destroy,
@@ -95,27 +95,7 @@ static struct xt_match string_mt_reg[] __read_mostly = {
 	{
 		.name 		= "string",
 		.revision	= 1,
-		.family		= NFPROTO_IPV4,
-		.checkentry	= string_mt_check,
-		.match 		= string_mt,
-		.destroy 	= string_mt_destroy,
-		.matchsize	= sizeof(struct xt_string_info),
-		.me 		= THIS_MODULE
-	},
-	{
-		.name 		= "string",
-		.revision	= 0,
-		.family		= NFPROTO_IPV6,
-		.checkentry	= string_mt_check,
-		.match 		= string_mt,
-		.destroy 	= string_mt_destroy,
-		.matchsize	= sizeof(struct xt_string_info),
-		.me 		= THIS_MODULE
-	},
-	{
-		.name 		= "string",
-		.revision	= 1,
-		.family		= NFPROTO_IPV6,
+		.family		= NFPROTO_UNSPEC,
 		.checkentry	= string_mt_check,
 		.match 		= string_mt,
 		.destroy 	= string_mt_destroy,
@@ -126,12 +106,13 @@ static struct xt_match string_mt_reg[] __read_mostly = {
 
 static int __init string_mt_init(void)
 {
-	return xt_register_matches(string_mt_reg, ARRAY_SIZE(string_mt_reg));
+	return xt_register_matches(xt_string_mt_reg,
+				   ARRAY_SIZE(xt_string_mt_reg));
 }
 
 static void __exit string_mt_exit(void)
 {
-	xt_unregister_matches(string_mt_reg, ARRAY_SIZE(string_mt_reg));
+	xt_unregister_matches(xt_string_mt_reg, ARRAY_SIZE(xt_string_mt_reg));
 }
 
 module_init(string_mt_init);
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index fe9dae2b4f5..32d4c769caa 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -237,33 +237,23 @@ time_mt_check(const char *tablename, const void *ip,
 	return true;
 }
 
-static struct xt_match time_mt_reg[] __read_mostly = {
-	{
-		.name       = "time",
-		.family     = NFPROTO_IPV4,
-		.match      = time_mt,
-		.matchsize  = sizeof(struct xt_time_info),
-		.checkentry = time_mt_check,
-		.me         = THIS_MODULE,
-	},
-	{
-		.name       = "time",
-		.family     = NFPROTO_IPV6,
-		.match      = time_mt,
-		.matchsize  = sizeof(struct xt_time_info),
-		.checkentry = time_mt_check,
-		.me         = THIS_MODULE,
-	},
+static struct xt_match xt_time_mt_reg __read_mostly = {
+	.name       = "time",
+	.family     = NFPROTO_UNSPEC,
+	.match      = time_mt,
+	.checkentry = time_mt_check,
+	.matchsize  = sizeof(struct xt_time_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init time_mt_init(void)
 {
-	return xt_register_matches(time_mt_reg, ARRAY_SIZE(time_mt_reg));
+	return xt_register_match(&xt_time_mt_reg);
 }
 
 static void __exit time_mt_exit(void)
 {
-	xt_unregister_matches(time_mt_reg, ARRAY_SIZE(time_mt_reg));
+	xt_unregister_match(&xt_time_mt_reg);
 }
 
 module_init(time_mt_init);
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index ed9f8340611..a6b971dc5d3 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -99,31 +99,23 @@ u32_mt(const struct sk_buff *skb, const struct net_device *in,
 	return ret ^ data->invert;
 }
 
-static struct xt_match u32_mt_reg[] __read_mostly = {
-	{
-		.name       = "u32",
-		.family     = NFPROTO_IPV4,
-		.match      = u32_mt,
-		.matchsize  = sizeof(struct xt_u32),
-		.me         = THIS_MODULE,
-	},
-	{
-		.name       = "u32",
-		.family     = NFPROTO_IPV6,
-		.match      = u32_mt,
-		.matchsize  = sizeof(struct xt_u32),
-		.me         = THIS_MODULE,
-	},
+static struct xt_match xt_u32_mt_reg __read_mostly = {
+	.name       = "u32",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = u32_mt,
+	.matchsize  = sizeof(struct xt_u32),
+	.me         = THIS_MODULE,
 };
 
 static int __init u32_mt_init(void)
 {
-	return xt_register_matches(u32_mt_reg, ARRAY_SIZE(u32_mt_reg));
+	return xt_register_match(&xt_u32_mt_reg);
 }
 
 static void __exit u32_mt_exit(void)
 {
-	xt_unregister_matches(u32_mt_reg, ARRAY_SIZE(u32_mt_reg));
+	xt_unregister_match(&xt_u32_mt_reg);
 }
 
 module_init(u32_mt_init);
-- 
cgit v1.2.3


From 48dc7865aa3db9404aedc8677d9daf8f8f469ab0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:01 +0200
Subject: netfilter: netns: remove nf_*_net() wrappers

Now that dev_net() exists, the usefullness of them is even less. Also they're
a big problem in resolving circular header dependencies necessary for
NOTRACK-in-netns patch. See below.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/iptable_filter.c    |  6 +++---
 net/ipv4/netfilter/iptable_mangle.c    | 10 +++++-----
 net/ipv4/netfilter/iptable_raw.c       |  4 ++--
 net/ipv4/netfilter/iptable_security.c  |  6 +++---
 net/ipv6/netfilter/ip6table_filter.c   |  6 +++---
 net/ipv6/netfilter/ip6table_security.c |  6 +++---
 6 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 1ea677dcf84..c9224310eba 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -70,7 +70,7 @@ ipt_local_in_hook(unsigned int hook,
 		  int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_local_in_net(in, out)->ipv4.iptable_filter);
+			    dev_net(in)->ipv4.iptable_filter);
 }
 
 static unsigned int
@@ -81,7 +81,7 @@ ipt_hook(unsigned int hook,
 	 int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_forward_net(in, out)->ipv4.iptable_filter);
+			    dev_net(in)->ipv4.iptable_filter);
 }
 
 static unsigned int
@@ -101,7 +101,7 @@ ipt_local_out_hook(unsigned int hook,
 	}
 
 	return ipt_do_table(skb, hook, in, out,
-			    nf_local_out_net(in, out)->ipv4.iptable_filter);
+			    dev_net(out)->ipv4.iptable_filter);
 }
 
 static struct nf_hook_ops ipt_ops[] __read_mostly = {
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index da59182f222..69f2c428714 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -81,7 +81,7 @@ ipt_pre_routing_hook(unsigned int hook,
 		     int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_pre_routing_net(in, out)->ipv4.iptable_mangle);
+			    dev_net(in)->ipv4.iptable_mangle);
 }
 
 static unsigned int
@@ -92,7 +92,7 @@ ipt_post_routing_hook(unsigned int hook,
 		      int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_post_routing_net(in, out)->ipv4.iptable_mangle);
+			    dev_net(out)->ipv4.iptable_mangle);
 }
 
 static unsigned int
@@ -103,7 +103,7 @@ ipt_local_in_hook(unsigned int hook,
 		  int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_local_in_net(in, out)->ipv4.iptable_mangle);
+			    dev_net(in)->ipv4.iptable_mangle);
 }
 
 static unsigned int
@@ -114,7 +114,7 @@ ipt_forward_hook(unsigned int hook,
 	 int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_forward_net(in, out)->ipv4.iptable_mangle);
+			    dev_net(in)->ipv4.iptable_mangle);
 }
 
 static unsigned int
@@ -147,7 +147,7 @@ ipt_local_hook(unsigned int hook,
 	tos = iph->tos;
 
 	ret = ipt_do_table(skb, hook, in, out,
-			   nf_local_out_net(in, out)->ipv4.iptable_mangle);
+			   dev_net(out)->ipv4.iptable_mangle);
 	/* Reroute for ANY change. */
 	if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
 		iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index fddce7754b7..8faebfe638f 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -53,7 +53,7 @@ ipt_hook(unsigned int hook,
 	 int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_pre_routing_net(in, out)->ipv4.iptable_raw);
+			    dev_net(in)->ipv4.iptable_raw);
 }
 
 static unsigned int
@@ -72,7 +72,7 @@ ipt_local_hook(unsigned int hook,
 		return NF_ACCEPT;
 	}
 	return ipt_do_table(skb, hook, in, out,
-			    nf_local_out_net(in, out)->ipv4.iptable_raw);
+			    dev_net(out)->ipv4.iptable_raw);
 }
 
 /* 'raw' is the very first table. */
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index db6d312128e..36f3be3cc42 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -73,7 +73,7 @@ ipt_local_in_hook(unsigned int hook,
 		  int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_local_in_net(in, out)->ipv4.iptable_security);
+			    dev_net(in)->ipv4.iptable_security);
 }
 
 static unsigned int
@@ -84,7 +84,7 @@ ipt_forward_hook(unsigned int hook,
 		 int (*okfn)(struct sk_buff *))
 {
 	return ipt_do_table(skb, hook, in, out,
-			    nf_forward_net(in, out)->ipv4.iptable_security);
+			    dev_net(in)->ipv4.iptable_security);
 }
 
 static unsigned int
@@ -103,7 +103,7 @@ ipt_local_out_hook(unsigned int hook,
 		return NF_ACCEPT;
 	}
 	return ipt_do_table(skb, hook, in, out,
-			    nf_local_out_net(in, out)->ipv4.iptable_security);
+			    dev_net(out)->ipv4.iptable_security);
 }
 
 static struct nf_hook_ops ipt_ops[] __read_mostly = {
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 55a2c290bad..b110a8a85a1 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -68,7 +68,7 @@ ip6t_local_in_hook(unsigned int hook,
 		   int (*okfn)(struct sk_buff *))
 {
 	return ip6t_do_table(skb, hook, in, out,
-			     nf_local_in_net(in, out)->ipv6.ip6table_filter);
+			     dev_net(in)->ipv6.ip6table_filter);
 }
 
 static unsigned int
@@ -79,7 +79,7 @@ ip6t_forward_hook(unsigned int hook,
 		  int (*okfn)(struct sk_buff *))
 {
 	return ip6t_do_table(skb, hook, in, out,
-			     nf_forward_net(in, out)->ipv6.ip6table_filter);
+			     dev_net(in)->ipv6.ip6table_filter);
 }
 
 static unsigned int
@@ -100,7 +100,7 @@ ip6t_local_out_hook(unsigned int hook,
 #endif
 
 	return ip6t_do_table(skb, hook, in, out,
-			     nf_local_out_net(in, out)->ipv6.ip6table_filter);
+			     dev_net(out)->ipv6.ip6table_filter);
 }
 
 static struct nf_hook_ops ip6t_ops[] __read_mostly = {
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 6e7131036bc..20bc52f13e4 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -72,7 +72,7 @@ ip6t_local_in_hook(unsigned int hook,
 		   int (*okfn)(struct sk_buff *))
 {
 	return ip6t_do_table(skb, hook, in, out,
-			     nf_local_in_net(in, out)->ipv6.ip6table_security);
+			     dev_net(in)->ipv6.ip6table_security);
 }
 
 static unsigned int
@@ -83,7 +83,7 @@ ip6t_forward_hook(unsigned int hook,
 		  int (*okfn)(struct sk_buff *))
 {
 	return ip6t_do_table(skb, hook, in, out,
-			     nf_forward_net(in, out)->ipv6.ip6table_security);
+			     dev_net(in)->ipv6.ip6table_security);
 }
 
 static unsigned int
@@ -95,7 +95,7 @@ ip6t_local_out_hook(unsigned int hook,
 {
 	/* TBD: handle short packets via raw socket */
 	return ip6t_do_table(skb, hook, in, out,
-			     nf_local_out_net(in, out)->ipv6.ip6table_security);
+			     dev_net(out)->ipv6.ip6table_security);
 }
 
 static struct nf_hook_ops ip6t_ops[] __read_mostly = {
-- 
cgit v1.2.3


From 1339dd91719f3e841b113ddaccd30fd87b9d2332 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:01 +0200
Subject: netfilter: netns: ip6table_raw in netns for real

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/ip6table_raw.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 92b91077ac2..109fab6f831 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -45,25 +45,37 @@ static struct xt_table packet_raw = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6t_hook(unsigned int hook,
+ip6t_pre_routing_hook(unsigned int hook,
 	 struct sk_buff *skb,
 	 const struct net_device *in,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_raw);
+	return ip6t_do_table(skb, hook, in, out,
+			     dev_net(in)->ipv6.ip6table_raw);
+}
+
+static unsigned int
+ip6t_local_out_hook(unsigned int hook,
+	 struct sk_buff *skb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	return ip6t_do_table(skb, hook, in, out,
+			     dev_net(out)->ipv6.ip6table_raw);
 }
 
 static struct nf_hook_ops ip6t_ops[] __read_mostly = {
 	{
-	  .hook = ip6t_hook,
+	  .hook = ip6t_pre_routing_hook,
 	  .pf = PF_INET6,
 	  .hooknum = NF_INET_PRE_ROUTING,
 	  .priority = NF_IP6_PRI_FIRST,
 	  .owner = THIS_MODULE,
 	},
 	{
-	  .hook = ip6t_hook,
+	  .hook = ip6t_local_out_hook,
 	  .pf = PF_INET6,
 	  .hooknum = NF_INET_LOCAL_OUT,
 	  .priority = NF_IP6_PRI_FIRST,
-- 
cgit v1.2.3


From 7dd1b8dad84c9561fe8949ed5db4de15aee877eb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:02 +0200
Subject: netfilter: netns: ip6table_mangle in netns for real

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/ip6table_mangle.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index f405cea21a8..d0b31b259d4 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -67,17 +67,29 @@ static struct xt_table packet_mangler = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6t_route_hook(unsigned int hook,
+ip6t_in_hook(unsigned int hook,
 	 struct sk_buff *skb,
 	 const struct net_device *in,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_mangle);
+	return ip6t_do_table(skb, hook, in, out,
+			     dev_net(in)->ipv6.ip6table_mangle);
 }
 
 static unsigned int
-ip6t_local_hook(unsigned int hook,
+ip6t_post_routing_hook(unsigned int hook,
+		struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
+{
+	return ip6t_do_table(skb, hook, in, out,
+			     dev_net(out)->ipv6.ip6table_mangle);
+}
+
+static unsigned int
+ip6t_local_out_hook(unsigned int hook,
 		   struct sk_buff *skb,
 		   const struct net_device *in,
 		   const struct net_device *out,
@@ -108,7 +120,8 @@ ip6t_local_hook(unsigned int hook,
 	/* flowlabel and prio (includes version, which shouldn't change either */
 	flowlabel = *((u_int32_t *)ipv6_hdr(skb));
 
-	ret = ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_mangle);
+	ret = ip6t_do_table(skb, hook, in, out,
+			    dev_net(out)->ipv6.ip6table_mangle);
 
 	if (ret != NF_DROP && ret != NF_STOLEN
 		&& (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr))
@@ -122,35 +135,35 @@ ip6t_local_hook(unsigned int hook,
 
 static struct nf_hook_ops ip6t_ops[] __read_mostly = {
 	{
-		.hook		= ip6t_route_hook,
+		.hook		= ip6t_in_hook,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP6_PRI_MANGLE,
 	},
 	{
-		.hook		= ip6t_route_hook,
+		.hook		= ip6t_in_hook,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
 		.hooknum	= NF_INET_LOCAL_IN,
 		.priority	= NF_IP6_PRI_MANGLE,
 	},
 	{
-		.hook		= ip6t_route_hook,
+		.hook		= ip6t_in_hook,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
 		.hooknum	= NF_INET_FORWARD,
 		.priority	= NF_IP6_PRI_MANGLE,
 	},
 	{
-		.hook		= ip6t_local_hook,
+		.hook		= ip6t_local_out_hook,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP6_PRI_MANGLE,
 	},
 	{
-		.hook		= ip6t_route_hook,
+		.hook		= ip6t_post_routing_hook,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
 		.hooknum	= NF_INET_POST_ROUTING,
-- 
cgit v1.2.3


From e10aad9998e463df8e25ec749538faf3324dd31b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:02 +0200
Subject: netfilter: netns: ip6t_REJECT in netns for real

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/ip6t_REJECT.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 672ad9ff3e2..f1a9fce1ec9 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -35,7 +35,7 @@ MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
 MODULE_LICENSE("GPL");
 
 /* Send RST reply */
-static void send_reset(struct sk_buff *oldskb)
+static void send_reset(struct net *net, struct sk_buff *oldskb)
 {
 	struct sk_buff *nskb;
 	struct tcphdr otcph, *tcph;
@@ -94,7 +94,7 @@ static void send_reset(struct sk_buff *oldskb)
 	fl.fl_ip_sport = otcph.dest;
 	fl.fl_ip_dport = otcph.source;
 	security_skb_classify_flow(oldskb, &fl);
-	dst = ip6_route_output(&init_net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl);
 	if (dst == NULL)
 		return;
 	if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0))
@@ -163,10 +163,11 @@ static void send_reset(struct sk_buff *oldskb)
 }
 
 static inline void
-send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum)
+send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code,
+	     unsigned int hooknum)
 {
 	if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
-		skb_in->dev = init_net.loopback_dev;
+		skb_in->dev = net->loopback_dev;
 
 	icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL);
 }
@@ -177,6 +178,7 @@ reject_tg6(struct sk_buff *skb, const struct net_device *in,
            const struct xt_target *target, const void *targinfo)
 {
 	const struct ip6t_reject_info *reject = targinfo;
+	struct net *net = dev_net(in ? in : out);
 
 	pr_debug("%s: medium point\n", __func__);
 	/* WARNING: This code causes reentry within ip6tables.
@@ -184,25 +186,25 @@ reject_tg6(struct sk_buff *skb, const struct net_device *in,
 	   must return an absolute verdict. --RR */
 	switch (reject->with) {
 	case IP6T_ICMP6_NO_ROUTE:
-		send_unreach(skb, ICMPV6_NOROUTE, hooknum);
+		send_unreach(net, skb, ICMPV6_NOROUTE, hooknum);
 		break;
 	case IP6T_ICMP6_ADM_PROHIBITED:
-		send_unreach(skb, ICMPV6_ADM_PROHIBITED, hooknum);
+		send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, hooknum);
 		break;
 	case IP6T_ICMP6_NOT_NEIGHBOUR:
-		send_unreach(skb, ICMPV6_NOT_NEIGHBOUR, hooknum);
+		send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, hooknum);
 		break;
 	case IP6T_ICMP6_ADDR_UNREACH:
-		send_unreach(skb, ICMPV6_ADDR_UNREACH, hooknum);
+		send_unreach(net, skb, ICMPV6_ADDR_UNREACH, hooknum);
 		break;
 	case IP6T_ICMP6_PORT_UNREACH:
-		send_unreach(skb, ICMPV6_PORT_UNREACH, hooknum);
+		send_unreach(net, skb, ICMPV6_PORT_UNREACH, hooknum);
 		break;
 	case IP6T_ICMP6_ECHOREPLY:
 		/* Do nothing */
 		break;
 	case IP6T_TCP_RESET:
-		send_reset(skb);
+		send_reset(net, skb);
 		break;
 	default:
 		if (net_ratelimit())
-- 
cgit v1.2.3


From dfdb8d791877052bbb527d9688d94a064721d8f7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:02 +0200
Subject: netfilter: netns nf_conntrack: add netns boilerplate

One comment: #ifdefs around #include is necessary to overcome amazing compile
breakages in NOTRACK-in-netns patch (see below).

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c       |  4 ++--
 net/netfilter/nf_conntrack_expect.c     |  4 ++--
 net/netfilter/nf_conntrack_standalone.c | 21 ++++++++++++++++++---
 3 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 6aaf64b5ded..ee79e932589 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1006,7 +1006,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_flush);
 
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
-void nf_conntrack_cleanup(void)
+void nf_conntrack_cleanup(struct net *net)
 {
 	rcu_assign_pointer(ip_ct_attach, NULL);
 
@@ -1120,7 +1120,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
-int __init nf_conntrack_init(void)
+int nf_conntrack_init(struct net *net)
 {
 	int max_factor = 8;
 	int ret;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 990fa12f2ee..e6a79f2a7c5 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -537,7 +537,7 @@ static const struct file_operations exp_file_ops = {
 };
 #endif /* CONFIG_PROC_FS */
 
-static int __init exp_proc_init(void)
+static int exp_proc_init(void)
 {
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *proc;
@@ -558,7 +558,7 @@ static void exp_proc_remove(void)
 
 module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600);
 
-int __init nf_conntrack_expect_init(void)
+int nf_conntrack_expect_init(void)
 {
 	int err = -ENOMEM;
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 8509db14670..81dec17196d 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -440,11 +440,26 @@ static void nf_conntrack_standalone_fini_sysctl(void)
 }
 #endif /* CONFIG_SYSCTL */
 
+static int nf_conntrack_net_init(struct net *net)
+{
+	return nf_conntrack_init(net);
+}
+
+static void nf_conntrack_net_exit(struct net *net)
+{
+	nf_conntrack_cleanup(net);
+}
+
+static struct pernet_operations nf_conntrack_net_ops = {
+	.init = nf_conntrack_net_init,
+	.exit = nf_conntrack_net_exit,
+};
+
 static int __init nf_conntrack_standalone_init(void)
 {
 	int ret;
 
-	ret = nf_conntrack_init();
+	ret = register_pernet_subsys(&nf_conntrack_net_ops);
 	if (ret < 0)
 		goto out;
 	ret = nf_conntrack_standalone_init_proc();
@@ -458,7 +473,7 @@ static int __init nf_conntrack_standalone_init(void)
 out_sysctl:
 	nf_conntrack_standalone_fini_proc();
 out_proc:
-	nf_conntrack_cleanup();
+	unregister_pernet_subsys(&nf_conntrack_net_ops);
 out:
 	return ret;
 }
@@ -467,7 +482,7 @@ static void __exit nf_conntrack_standalone_fini(void)
 {
 	nf_conntrack_standalone_fini_sysctl();
 	nf_conntrack_standalone_fini_proc();
-	nf_conntrack_cleanup();
+	unregister_pernet_subsys(&nf_conntrack_net_ops);
 }
 
 module_init(nf_conntrack_standalone_init);
-- 
cgit v1.2.3


From 5a1fb391d881905e89623d78858d05b248cbc86a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:02 +0200
Subject: netfilter: netns nf_conntrack: add ->ct_net -- pointer from conntrack
 to netns

Conntrack (struct nf_conn) gets pointer to netns: ->ct_net -- netns in which
it was created. It comes from netdevice.

->ct_net is write-once field.

Every conntrack in system has ->ct_net initialized, no exceptions.

->ct_net doesn't pin netns: conntracks are recycled after timeouts and
pinning background traffic will prevent netns from even starting shutdown
sequence.

Right now every conntrack is created in init_net.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c    | 17 +++++++++++++----
 net/netfilter/nf_conntrack_netlink.c |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ee79e932589..cefc338f6e5 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -464,7 +464,8 @@ static noinline int early_drop(unsigned int hash)
 	return dropped;
 }
 
-struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+struct nf_conn *nf_conntrack_alloc(struct net *net,
+				   const struct nf_conntrack_tuple *orig,
 				   const struct nf_conntrack_tuple *repl,
 				   gfp_t gfp)
 {
@@ -503,6 +504,9 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 	/* Don't set timer yet: wait for confirmation */
 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
+#ifdef CONFIG_NET_NS
+	ct->ct_net = net;
+#endif
 	INIT_RCU_HEAD(&ct->rcu);
 
 	return ct;
@@ -528,7 +532,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
 /* Allocate a new conntrack: we return -ENOMEM if classification
    failed due to stress.  Otherwise it really is unclassifiable. */
 static struct nf_conntrack_tuple_hash *
-init_conntrack(const struct nf_conntrack_tuple *tuple,
+init_conntrack(struct net *net,
+	       const struct nf_conntrack_tuple *tuple,
 	       struct nf_conntrack_l3proto *l3proto,
 	       struct nf_conntrack_l4proto *l4proto,
 	       struct sk_buff *skb,
@@ -544,7 +549,7 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 		return NULL;
 	}
 
-	ct = nf_conntrack_alloc(tuple, &repl_tuple, GFP_ATOMIC);
+	ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
 	if (ct == NULL || IS_ERR(ct)) {
 		pr_debug("Can't allocate conntrack.\n");
 		return (struct nf_conntrack_tuple_hash *)ct;
@@ -631,7 +636,8 @@ resolve_normal_ct(struct sk_buff *skb,
 	/* look for tuple match */
 	h = nf_conntrack_find_get(&tuple);
 	if (!h) {
-		h = init_conntrack(&tuple, l3proto, l4proto, skb, dataoff);
+		h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb,
+				   dataoff);
 		if (!h)
 			return NULL;
 		if (IS_ERR(h))
@@ -1185,6 +1191,9 @@ int nf_conntrack_init(struct net *net)
 
 	/* Set up fake conntrack:
 	    - to never be deleted, not in any hashes */
+#ifdef CONFIG_NET_NS
+	nf_conntrack_untracked.ct_net = &init_net;
+#endif
 	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
 	/*  - and look it like as a confirmed connection */
 	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index a8752031adc..da3cdc8db70 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1125,7 +1125,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
 	struct nf_conn_help *help;
 	struct nf_conntrack_helper *helper;
 
-	ct = nf_conntrack_alloc(otuple, rtuple, GFP_KERNEL);
+	ct = nf_conntrack_alloc(&init_net, otuple, rtuple, GFP_KERNEL);
 	if (ct == NULL || IS_ERR(ct))
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 49ac8713b6d064adf7474080fdccebd7cce76be0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:03 +0200
Subject: netfilter: netns nf_conntrack: per-netns conntrack count

Sysctls and proc files are stubbed to init_net's one. This is temporary.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c        |  2 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c |  2 +-
 net/netfilter/nf_conntrack_core.c                     | 18 ++++++++----------
 net/netfilter/nf_conntrack_standalone.c               |  4 ++--
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5a955c44036..31abee3e29f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -254,7 +254,7 @@ static ctl_table ip_ct_sysctl_table[] = {
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_COUNT,
 		.procname	= "ip_conntrack_count",
-		.data		= &nf_conntrack_count,
+		.data		= &init_net.ct.count,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 3a020720e40..4556805027f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -314,7 +314,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
 
 static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 {
-	unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
+	unsigned int nr_conntracks = atomic_read(&init_net.ct.count);
 	const struct ip_conntrack_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index cefc338f6e5..8299b3490e7 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -44,10 +44,6 @@
 DEFINE_SPINLOCK(nf_conntrack_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 
-/* nf_conntrack_standalone needs this */
-atomic_t nf_conntrack_count = ATOMIC_INIT(0);
-EXPORT_SYMBOL_GPL(nf_conntrack_count);
-
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
@@ -477,13 +473,13 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
 	}
 
 	/* We don't want any race condition at early drop stage */
-	atomic_inc(&nf_conntrack_count);
+	atomic_inc(&net->ct.count);
 
 	if (nf_conntrack_max &&
-	    unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) {
+	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
 		unsigned int hash = hash_conntrack(orig);
 		if (!early_drop(hash)) {
-			atomic_dec(&nf_conntrack_count);
+			atomic_dec(&net->ct.count);
 			if (net_ratelimit())
 				printk(KERN_WARNING
 				       "nf_conntrack: table full, dropping"
@@ -495,7 +491,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
 	ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp);
 	if (ct == NULL) {
 		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
-		atomic_dec(&nf_conntrack_count);
+		atomic_dec(&net->ct.count);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -516,10 +512,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 static void nf_conntrack_free_rcu(struct rcu_head *head)
 {
 	struct nf_conn *ct = container_of(head, struct nf_conn, rcu);
+	struct net *net = nf_ct_net(ct);
 
 	nf_ct_ext_free(ct);
 	kmem_cache_free(nf_conntrack_cachep, ct);
-	atomic_dec(&nf_conntrack_count);
+	atomic_dec(&net->ct.count);
 }
 
 void nf_conntrack_free(struct nf_conn *ct)
@@ -1024,7 +1021,7 @@ void nf_conntrack_cleanup(struct net *net)
 	nf_ct_event_cache_flush();
  i_see_dead_people:
 	nf_conntrack_flush();
-	if (atomic_read(&nf_conntrack_count) != 0) {
+	if (atomic_read(&net->ct.count) != 0) {
 		schedule();
 		goto i_see_dead_people;
 	}
@@ -1148,6 +1145,7 @@ int nf_conntrack_init(struct net *net)
 		 * entries. */
 		max_factor = 4;
 	}
+	atomic_set(&net->ct.count, 0);
 	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
 						  &nf_conntrack_vmalloc);
 	if (!nf_conntrack_hash) {
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 81dec17196d..021b505907d 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -226,7 +226,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
 
 static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 {
-	unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
+	unsigned int nr_conntracks = atomic_read(&init_net.ct.count);
 	const struct ip_conntrack_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
@@ -338,7 +338,7 @@ static ctl_table nf_ct_sysctl_table[] = {
 	{
 		.ctl_name	= NET_NF_CONNTRACK_COUNT,
 		.procname	= "nf_conntrack_count",
-		.data		= &nf_conntrack_count,
+		.data		= &init_net.ct.count,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
-- 
cgit v1.2.3


From 400dad39d1c33fe797e47326d87a3f54d0ac5181 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:03 +0200
Subject: netfilter: netns nf_conntrack: per-netns conntrack hash

* make per-netns conntrack hash

  Other solution is to add ->ct_net pointer to tuplehashes and still has one
  hash, I tried that it's ugly and requires more code deep down in protocol
  modules et al.

* propagate netns pointer to where needed, e. g. to conntrack iterators.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_MASQUERADE.c                |  3 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c     |  2 +-
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   |  4 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c       |  2 +-
 net/ipv4/netfilter/nf_nat_core.c                   |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c     |  2 +-
 net/netfilter/nf_conntrack_core.c                  | 74 +++++++++++-----------
 net/netfilter/nf_conntrack_helper.c                |  2 +-
 net/netfilter/nf_conntrack_netlink.c               | 16 ++---
 net/netfilter/nf_conntrack_pptp.c                  |  2 +-
 net/netfilter/nf_conntrack_proto.c                 |  4 +-
 net/netfilter/nf_conntrack_standalone.c            |  4 +-
 net/netfilter/xt_connlimit.c                       |  2 +-
 13 files changed, 61 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 9a4822f8243..5e1c81791e5 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -129,7 +129,8 @@ static int masq_device_event(struct notifier_block *this,
 		   and forget them. */
 		NF_CT_ASSERT(dev->ifindex != 0);
 
-		nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
+		nf_ct_iterate_cleanup(&init_net, device_cmp,
+				      (void *)(long)dev->ifindex);
 	}
 
 	return NOTIFY_DONE;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 31abee3e29f..03dd108015c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -323,7 +323,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 		return -EINVAL;
 	}
 
-	h = nf_conntrack_find_get(&tuple);
+	h = nf_conntrack_find_get(sock_net(sk), &tuple);
 	if (h) {
 		struct sockaddr_in sin;
 		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 4556805027f..8e0afdc2b13 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -32,7 +32,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
 	for (st->bucket = 0;
 	     st->bucket < nf_conntrack_htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(nf_conntrack_hash[st->bucket].first);
+		n = rcu_dereference(init_net.ct.hash[st->bucket].first);
 		if (n)
 			return n;
 	}
@@ -48,7 +48,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq,
 	while (head == NULL) {
 		if (++st->bucket >= nf_conntrack_htable_size)
 			return NULL;
-		head = rcu_dereference(nf_conntrack_hash[st->bucket].first);
+		head = rcu_dereference(init_net.ct.hash[st->bucket].first);
 	}
 	return head;
 }
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index da8edcdaef3..daf346377b6 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -155,7 +155,7 @@ icmp_error_message(struct sk_buff *skb,
 
 	*ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(&innertuple);
+	h = nf_conntrack_find_get(&init_net, &innertuple);
 	if (!h) {
 		pr_debug("icmp_error_message: no match\n");
 		return -NF_ACCEPT;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 6c6a3cba8d5..5d4a5b70da2 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -643,7 +643,7 @@ static int clean_nat(struct nf_conn *i, void *data)
 
 static void __exit nf_nat_cleanup(void)
 {
-	nf_ct_iterate_cleanup(&clean_nat, NULL);
+	nf_ct_iterate_cleanup(&init_net, &clean_nat, NULL);
 	synchronize_rcu();
 	nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size);
 	nf_ct_l3proto_put(l3proto);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 5756f30ebc6..548cf4f15c0 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -156,7 +156,7 @@ icmpv6_error_message(struct sk_buff *skb,
 
 	*ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(&intuple);
+	h = nf_conntrack_find_get(&init_net, &intuple);
 	if (!h) {
 		pr_debug("icmpv6_error: no match\n");
 		return -NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8299b3490e7..da56b260552 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -50,15 +50,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 int nf_conntrack_max __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_max);
 
-struct hlist_head *nf_conntrack_hash __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_hash);
-
 struct nf_conn nf_conntrack_untracked __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 
 unsigned int nf_ct_log_invalid __read_mostly;
 HLIST_HEAD(unconfirmed);
-static int nf_conntrack_vmalloc __read_mostly;
 static struct kmem_cache *nf_conntrack_cachep __read_mostly;
 
 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
@@ -242,7 +238,7 @@ static void death_by_timeout(unsigned long ul_conntrack)
 }
 
 struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(const struct nf_conntrack_tuple *tuple)
+__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_node *n;
@@ -252,7 +248,7 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple)
 	 * at least once for the stats anyway.
 	 */
 	local_bh_disable();
-	hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) {
+	hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) {
 		if (nf_ct_tuple_equal(tuple, &h->tuple)) {
 			NF_CT_STAT_INC(found);
 			local_bh_enable();
@@ -268,13 +264,13 @@ EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 
 /* Find a connection corresponding to a tuple. */
 struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple)
+nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 
 	rcu_read_lock();
-	h = __nf_conntrack_find(tuple);
+	h = __nf_conntrack_find(net, tuple);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
@@ -290,10 +286,12 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 				       unsigned int hash,
 				       unsigned int repl_hash)
 {
+	struct net *net = nf_ct_net(ct);
+
 	hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
-			   &nf_conntrack_hash[hash]);
+			   &net->ct.hash[hash]);
 	hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode,
-			   &nf_conntrack_hash[repl_hash]);
+			   &net->ct.hash[repl_hash]);
 }
 
 void nf_conntrack_hash_insert(struct nf_conn *ct)
@@ -319,8 +317,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	struct nf_conn_help *help;
 	struct hlist_node *n;
 	enum ip_conntrack_info ctinfo;
+	struct net *net;
 
 	ct = nf_ct_get(skb, &ctinfo);
+	net = nf_ct_net(ct);
 
 	/* ipt_REJECT uses nf_conntrack_attach to attach related
 	   ICMP/TCP RST packets in other direction.  Actual packet
@@ -347,11 +347,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	/* See if there's one in the list already, including reverse:
 	   NAT could have grabbed it without realizing, since we're
 	   not in the hash.  If there is, we lost race. */
-	hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode)
+	hlist_for_each_entry(h, n, &net->ct.hash[hash], hnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				      &h->tuple))
 			goto out;
-	hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode)
+	hlist_for_each_entry(h, n, &net->ct.hash[repl_hash], hnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple))
 			goto out;
@@ -394,6 +394,7 @@ int
 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			 const struct nf_conn *ignored_conntrack)
 {
+	struct net *net = nf_ct_net(ignored_conntrack);
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_node *n;
 	unsigned int hash = hash_conntrack(tuple);
@@ -402,7 +403,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 	 * least once for the stats anyway.
 	 */
 	rcu_read_lock_bh();
-	hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) {
+	hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) {
 		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
 		    nf_ct_tuple_equal(tuple, &h->tuple)) {
 			NF_CT_STAT_INC(found);
@@ -421,7 +422,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int hash)
 {
 	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
@@ -432,7 +433,7 @@ static noinline int early_drop(unsigned int hash)
 
 	rcu_read_lock();
 	for (i = 0; i < nf_conntrack_htable_size; i++) {
-		hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+		hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					 hnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
@@ -478,7 +479,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
 		unsigned int hash = hash_conntrack(orig);
-		if (!early_drop(hash)) {
+		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			if (net_ratelimit())
 				printk(KERN_WARNING
@@ -631,7 +632,7 @@ resolve_normal_ct(struct sk_buff *skb,
 	}
 
 	/* look for tuple match */
-	h = nf_conntrack_find_get(&tuple);
+	h = nf_conntrack_find_get(&init_net, &tuple);
 	if (!h) {
 		h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb,
 				   dataoff);
@@ -941,7 +942,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 
 /* Bring out ya dead! */
 static struct nf_conn *
-get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
+get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 		void *data, unsigned int *bucket)
 {
 	struct nf_conntrack_tuple_hash *h;
@@ -950,7 +951,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
 
 	spin_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
-		hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) {
+		hlist_for_each_entry(h, n, &net->ct.hash[*bucket], hnode) {
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			if (iter(ct, data))
 				goto found;
@@ -969,13 +970,14 @@ found:
 	return ct;
 }
 
-void
-nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
+void nf_ct_iterate_cleanup(struct net *net,
+			   int (*iter)(struct nf_conn *i, void *data),
+			   void *data)
 {
 	struct nf_conn *ct;
 	unsigned int bucket = 0;
 
-	while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
+	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
 		if (del_timer(&ct->timeout))
 			death_by_timeout((unsigned long)ct);
@@ -1001,9 +1003,9 @@ void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, unsigned int s
 }
 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
 
-void nf_conntrack_flush(void)
+void nf_conntrack_flush(struct net *net)
 {
-	nf_ct_iterate_cleanup(kill_all, NULL);
+	nf_ct_iterate_cleanup(net, kill_all, NULL);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush);
 
@@ -1020,7 +1022,7 @@ void nf_conntrack_cleanup(struct net *net)
 
 	nf_ct_event_cache_flush();
  i_see_dead_people:
-	nf_conntrack_flush();
+	nf_conntrack_flush(net);
 	if (atomic_read(&net->ct.count) != 0) {
 		schedule();
 		goto i_see_dead_people;
@@ -1032,7 +1034,7 @@ void nf_conntrack_cleanup(struct net *net)
 	rcu_assign_pointer(nf_ct_destroy, NULL);
 
 	kmem_cache_destroy(nf_conntrack_cachep);
-	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc,
+	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
 
 	nf_conntrack_acct_fini();
@@ -1097,8 +1099,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	 */
 	spin_lock_bh(&nf_conntrack_lock);
 	for (i = 0; i < nf_conntrack_htable_size; i++) {
-		while (!hlist_empty(&nf_conntrack_hash[i])) {
-			h = hlist_entry(nf_conntrack_hash[i].first,
+		while (!hlist_empty(&init_net.ct.hash[i])) {
+			h = hlist_entry(init_net.ct.hash[i].first,
 					struct nf_conntrack_tuple_hash, hnode);
 			hlist_del_rcu(&h->hnode);
 			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
@@ -1106,12 +1108,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 		}
 	}
 	old_size = nf_conntrack_htable_size;
-	old_vmalloced = nf_conntrack_vmalloc;
-	old_hash = nf_conntrack_hash;
+	old_vmalloced = init_net.ct.hash_vmalloc;
+	old_hash = init_net.ct.hash;
 
 	nf_conntrack_htable_size = hashsize;
-	nf_conntrack_vmalloc = vmalloced;
-	nf_conntrack_hash = hash;
+	init_net.ct.hash_vmalloc = vmalloced;
+	init_net.ct.hash = hash;
 	nf_conntrack_hash_rnd = rnd;
 	spin_unlock_bh(&nf_conntrack_lock);
 
@@ -1146,9 +1148,9 @@ int nf_conntrack_init(struct net *net)
 		max_factor = 4;
 	}
 	atomic_set(&net->ct.count, 0);
-	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
-						  &nf_conntrack_vmalloc);
-	if (!nf_conntrack_hash) {
+	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
+						  &net->ct.hash_vmalloc);
+	if (!net->ct.hash) {
 		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
 		goto err_out;
 	}
@@ -1207,7 +1209,7 @@ out_fini_proto:
 err_free_conntrack_slab:
 	kmem_cache_destroy(nf_conntrack_cachep);
 err_free_hash:
-	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc,
+	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
 err_out:
 	return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 8e0b4c8f62a..d91278dfdaf 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -159,7 +159,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 	hlist_for_each_entry(h, n, &unconfirmed, hnode)
 		unhelp(h, me);
 	for (i = 0; i < nf_conntrack_htable_size; i++) {
-		hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode)
+		hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode)
 			unhelp(h, me);
 	}
 	spin_unlock_bh(&nf_conntrack_lock);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index da3cdc8db70..918a3358a12 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -549,7 +549,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	last = (struct nf_conn *)cb->args[1];
 	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
 restart:
-		hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]],
+		hlist_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]],
 					 hnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 				continue;
@@ -794,14 +794,14 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
 	else {
 		/* Flush the whole table */
-		nf_conntrack_flush();
+		nf_conntrack_flush(&init_net);
 		return 0;
 	}
 
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(&tuple);
+	h = nf_conntrack_find_get(&init_net, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -847,7 +847,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(&tuple);
+	h = nf_conntrack_find_get(&init_net, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -1213,9 +1213,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 
 	spin_lock_bh(&nf_conntrack_lock);
 	if (cda[CTA_TUPLE_ORIG])
-		h = __nf_conntrack_find(&otuple);
+		h = __nf_conntrack_find(&init_net, &otuple);
 	else if (cda[CTA_TUPLE_REPLY])
-		h = __nf_conntrack_find(&rtuple);
+		h = __nf_conntrack_find(&init_net, &rtuple);
 
 	if (h == NULL) {
 		struct nf_conntrack_tuple master;
@@ -1230,7 +1230,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 			if (err < 0)
 				goto out_unlock;
 
-			master_h = __nf_conntrack_find(&master);
+			master_h = __nf_conntrack_find(&init_net, &master);
 			if (master_h == NULL) {
 				err = -ENOENT;
 				goto out_unlock;
@@ -1670,7 +1670,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3)
 		return err;
 
 	/* Look for master conntrack of this expectation */
-	h = nf_conntrack_find_get(&master_tuple);
+	h = nf_conntrack_find_get(&init_net, &master_tuple);
 	if (!h)
 		return -ENOENT;
 	ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 97e54b0e43a..7caf45b59d2 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -143,7 +143,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 	pr_debug("trying to timeout ct or exp for tuple ");
 	nf_ct_dump_tuple(t);
 
-	h = nf_conntrack_find_get(t);
+	h = nf_conntrack_find_get(&init_net, t);
 	if (h)  {
 		sibling = nf_ct_tuplehash_to_ctrack(h);
 		pr_debug("setting timeout of conntrack %p to 0\n", sibling);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index a49fc932629..3a2f7ef997f 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -219,7 +219,7 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
 	synchronize_rcu();
 
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_cleanup(kill_l3proto, proto);
+	nf_ct_iterate_cleanup(&init_net, kill_l3proto, proto);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
 
@@ -328,7 +328,7 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
 	synchronize_rcu();
 
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_cleanup(kill_l4proto, l4proto);
+	nf_ct_iterate_cleanup(&init_net, kill_l4proto, l4proto);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 021b505907d..5456e4b9424 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -51,7 +51,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
 	for (st->bucket = 0;
 	     st->bucket < nf_conntrack_htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(nf_conntrack_hash[st->bucket].first);
+		n = rcu_dereference(init_net.ct.hash[st->bucket].first);
 		if (n)
 			return n;
 	}
@@ -67,7 +67,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq,
 	while (head == NULL) {
 		if (++st->bucket >= nf_conntrack_htable_size)
 			return NULL;
-		head = rcu_dereference(nf_conntrack_hash[st->bucket].first);
+		head = rcu_dereference(init_net.ct.hash[st->bucket].first);
 	}
 	return head;
 }
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index d2453d182d6..bd00830ff69 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -123,7 +123,7 @@ static int count_them(struct xt_connlimit_data *data,
 
 	/* check the saved connections */
 	list_for_each_entry_safe(conn, tmp, hash, list) {
-		found    = __nf_conntrack_find(&conn->tuple);
+		found    = __nf_conntrack_find(&init_net, &conn->tuple);
 		found_ct = NULL;
 
 		if (found != NULL)
-- 
cgit v1.2.3


From b21f89019399ff75d9c239010e38b840eb6e01e7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:03 +0200
Subject: netfilter: netns: fix {ip,6}_route_me_harder() in netns

Take netns from skb->dst->dev. It should be safe because, they are called
from LOCAL_OUT hook where dst is valid (though, I'm not exactly sure about
IPVS and queueing packets to userspace).

[Patrick: its safe everywhere since they already expect skb->dst to be set]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter.c | 7 ++++---
 net/ipv6/netfilter.c | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 01671ad51ed..6efdb70b3eb 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -12,6 +12,7 @@
 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
 int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 {
+	struct net *net = dev_net(skb->dst->dev);
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi fl = {};
@@ -19,7 +20,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 	unsigned int hh_len;
 	unsigned int type;
 
-	type = inet_addr_type(&init_net, iph->saddr);
+	type = inet_addr_type(net, iph->saddr);
 	if (skb->sk && inet_sk(skb->sk)->transparent)
 		type = RTN_LOCAL;
 	if (addr_type == RTN_UNSPEC)
@@ -36,7 +37,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
 		fl.mark = skb->mark;
 		fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
-		if (ip_route_output_key(&init_net, &rt, &fl) != 0)
+		if (ip_route_output_key(net, &rt, &fl) != 0)
 			return -1;
 
 		/* Drop old route. */
@@ -46,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		/* non-local src, find valid iif to satisfy
 		 * rp-filter when calling ip_route_input. */
 		fl.nl_u.ip4_u.daddr = iph->saddr;
-		if (ip_route_output_key(&init_net, &rt, &fl) != 0)
+		if (ip_route_output_key(net, &rt, &fl) != 0)
 			return -1;
 
 		odst = skb->dst;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 8c6c5e71f21..4cb4844a322 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -23,7 +23,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
 		    .saddr = iph->saddr, } },
 	};
 
-	dst = ip6_route_output(&init_net, skb->sk, &fl);
+	dst = ip6_route_output(dev_net(skb->dst->dev), skb->sk, &fl);
 
 #ifdef CONFIG_XFRM
 	if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
-- 
cgit v1.2.3


From 9b03f38d0487f3908696242286d934c9b38f9d2a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:03 +0200
Subject: netfilter: netns nf_conntrack: per-netns expectations

Make per-netns a) expectation hash and b) expectations count.

Expectations always belongs to netns to which it's master conntrack belong.
This is natural and doesn't bloat expectation.

Proc files and leaf users are stubbed to init_net, this is temporary.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   |  6 ++-
 net/ipv4/netfilter/nf_nat_pptp.c                   |  2 +-
 net/netfilter/nf_conntrack_core.c                  |  8 ++--
 net/netfilter/nf_conntrack_expect.c                | 55 +++++++++++-----------
 net/netfilter/nf_conntrack_h323_main.c             |  2 +-
 net/netfilter/nf_conntrack_helper.c                |  2 +-
 net/netfilter/nf_conntrack_netlink.c               | 13 ++---
 net/netfilter/nf_conntrack_pptp.c                  |  4 +-
 net/netfilter/nf_conntrack_sip.c                   |  2 +-
 9 files changed, 49 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 8e0afdc2b13..f8636a57e8c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -177,11 +177,12 @@ struct ct_expect_iter_state {
 
 static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 {
+	struct net *net = &init_net;
 	struct ct_expect_iter_state *st = seq->private;
 	struct hlist_node *n;
 
 	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
-		n = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
+		n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
 		if (n)
 			return n;
 	}
@@ -191,13 +192,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
 					     struct hlist_node *head)
 {
+	struct net *net = &init_net;
 	struct ct_expect_iter_state *st = seq->private;
 
 	head = rcu_dereference(head->next);
 	while (head == NULL) {
 		if (++st->bucket >= nf_ct_expect_hsize)
 			return NULL;
-		head = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
+		head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
 	}
 	return head;
 }
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index da3d91a5ef5..e4bdddc6034 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -73,7 +73,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 
 	pr_debug("trying to unexpect other dir: ");
 	nf_ct_dump_tuple_ip(&t);
-	other_exp = nf_ct_expect_find_get(&t);
+	other_exp = nf_ct_expect_find_get(&init_net, &t);
 	if (other_exp) {
 		nf_ct_unexpect_related(other_exp);
 		nf_ct_expect_put(other_exp);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index da56b260552..c188edea249 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -562,7 +562,7 @@ init_conntrack(struct net *net,
 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
 
 	spin_lock_bh(&nf_conntrack_lock);
-	exp = nf_ct_find_expectation(tuple);
+	exp = nf_ct_find_expectation(net, tuple);
 	if (exp) {
 		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
 			 ct, exp);
@@ -1038,7 +1038,7 @@ void nf_conntrack_cleanup(struct net *net)
 			     nf_conntrack_htable_size);
 
 	nf_conntrack_acct_fini();
-	nf_conntrack_expect_fini();
+	nf_conntrack_expect_fini(net);
 	nf_conntrack_helper_fini();
 	nf_conntrack_proto_fini();
 }
@@ -1173,7 +1173,7 @@ int nf_conntrack_init(struct net *net)
 	if (ret < 0)
 		goto err_free_conntrack_slab;
 
-	ret = nf_conntrack_expect_init();
+	ret = nf_conntrack_expect_init(net);
 	if (ret < 0)
 		goto out_fini_proto;
 
@@ -1203,7 +1203,7 @@ int nf_conntrack_init(struct net *net)
 out_fini_helper:
 	nf_conntrack_helper_fini();
 out_fini_expect:
-	nf_conntrack_expect_fini();
+	nf_conntrack_expect_fini(net);
 out_fini_proto:
 	nf_conntrack_proto_fini();
 err_free_conntrack_slab:
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index e6a79f2a7c5..5307316356e 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -28,17 +28,12 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 
-struct hlist_head *nf_ct_expect_hash __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
-
 unsigned int nf_ct_expect_hsize __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
 
 static unsigned int nf_ct_expect_hash_rnd __read_mostly;
-static unsigned int nf_ct_expect_count;
 unsigned int nf_ct_expect_max __read_mostly;
 static int nf_ct_expect_hash_rnd_initted __read_mostly;
-static int nf_ct_expect_vmalloc;
 
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
@@ -46,12 +41,13 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
+	struct net *net = nf_ct_exp_net(exp);
 
 	NF_CT_ASSERT(master_help);
 	NF_CT_ASSERT(!timer_pending(&exp->timeout));
 
 	hlist_del_rcu(&exp->hnode);
-	nf_ct_expect_count--;
+	net->ct.expect_count--;
 
 	hlist_del(&exp->lnode);
 	master_help->expecting[exp->class]--;
@@ -87,17 +83,17 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple
 }
 
 struct nf_conntrack_expect *
-__nf_ct_expect_find(const struct nf_conntrack_tuple *tuple)
+__nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_expect *i;
 	struct hlist_node *n;
 	unsigned int h;
 
-	if (!nf_ct_expect_count)
+	if (!net->ct.expect_count)
 		return NULL;
 
 	h = nf_ct_expect_dst_hash(tuple);
-	hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) {
+	hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) {
 		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
 			return i;
 	}
@@ -107,12 +103,12 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
 
 /* Just find a expectation corresponding to a tuple. */
 struct nf_conntrack_expect *
-nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple)
+nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_expect *i;
 
 	rcu_read_lock();
-	i = __nf_ct_expect_find(tuple);
+	i = __nf_ct_expect_find(net, tuple);
 	if (i && !atomic_inc_not_zero(&i->use))
 		i = NULL;
 	rcu_read_unlock();
@@ -124,17 +120,17 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
 /* If an expectation for this connection is found, it gets delete from
  * global list then returned. */
 struct nf_conntrack_expect *
-nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple)
+nf_ct_find_expectation(struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_expect *i, *exp = NULL;
 	struct hlist_node *n;
 	unsigned int h;
 
-	if (!nf_ct_expect_count)
+	if (!net->ct.expect_count)
 		return NULL;
 
 	h = nf_ct_expect_dst_hash(tuple);
-	hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) {
+	hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
 		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
 		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
 			exp = i;
@@ -311,6 +307,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put);
 static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
+	struct net *net = nf_ct_exp_net(exp);
 	const struct nf_conntrack_expect_policy *p;
 	unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
 
@@ -319,8 +316,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	hlist_add_head(&exp->lnode, &master_help->expectations);
 	master_help->expecting[exp->class]++;
 
-	hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
-	nf_ct_expect_count++;
+	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+	net->ct.expect_count++;
 
 	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
 		    (unsigned long)exp);
@@ -371,6 +368,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect)
 	struct nf_conntrack_expect *i;
 	struct nf_conn *master = expect->master;
 	struct nf_conn_help *master_help = nfct_help(master);
+	struct net *net = nf_ct_exp_net(expect);
 	struct hlist_node *n;
 	unsigned int h;
 	int ret;
@@ -383,7 +381,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect)
 		goto out;
 	}
 	h = nf_ct_expect_dst_hash(&expect->tuple);
-	hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) {
+	hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
 		if (expect_matches(i, expect)) {
 			/* Refresh timer: if it's dying, ignore.. */
 			if (refresh_timer(i)) {
@@ -406,7 +404,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect)
 		}
 	}
 
-	if (nf_ct_expect_count >= nf_ct_expect_max) {
+	if (net->ct.expect_count >= nf_ct_expect_max) {
 		if (net_ratelimit())
 			printk(KERN_WARNING
 			       "nf_conntrack: expectation table full\n");
@@ -430,11 +428,12 @@ struct ct_expect_iter_state {
 
 static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 {
+	struct net *net = &init_net;
 	struct ct_expect_iter_state *st = seq->private;
 	struct hlist_node *n;
 
 	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
-		n = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
+		n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
 		if (n)
 			return n;
 	}
@@ -444,13 +443,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
 					     struct hlist_node *head)
 {
+	struct net *net = &init_net;
 	struct ct_expect_iter_state *st = seq->private;
 
 	head = rcu_dereference(head->next);
 	while (head == NULL) {
 		if (++st->bucket >= nf_ct_expect_hsize)
 			return NULL;
-		head = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
+		head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
 	}
 	return head;
 }
@@ -558,7 +558,7 @@ static void exp_proc_remove(void)
 
 module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600);
 
-int nf_conntrack_expect_init(void)
+int nf_conntrack_expect_init(struct net *net)
 {
 	int err = -ENOMEM;
 
@@ -569,9 +569,10 @@ int nf_conntrack_expect_init(void)
 	}
 	nf_ct_expect_max = nf_ct_expect_hsize * 4;
 
-	nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
-						  &nf_ct_expect_vmalloc);
-	if (nf_ct_expect_hash == NULL)
+	net->ct.expect_count = 0;
+	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
+						  &net->ct.expect_vmalloc);
+	if (net->ct.expect_hash == NULL)
 		goto err1;
 
 	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
@@ -589,16 +590,16 @@ int nf_conntrack_expect_init(void)
 err3:
 	kmem_cache_destroy(nf_ct_expect_cachep);
 err2:
-	nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
+	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
 			     nf_ct_expect_hsize);
 err1:
 	return err;
 }
 
-void nf_conntrack_expect_fini(void)
+void nf_conntrack_expect_fini(struct net *net)
 {
 	exp_proc_remove();
 	kmem_cache_destroy(nf_ct_expect_cachep);
-	nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
+	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
 			     nf_ct_expect_hsize);
 }
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5dc0478108a..dfb826c973d 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1219,7 +1219,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
 	tuple.dst.u.tcp.port = port;
 	tuple.dst.protonum = IPPROTO_TCP;
 
-	exp = __nf_ct_expect_find(&tuple);
+	exp = __nf_ct_expect_find(&init_net, &tuple);
 	if (exp && exp->master == ct)
 		return exp;
 	return NULL;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index d91278dfdaf..c793db810cd 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -145,7 +145,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 	/* Get rid of expectations */
 	for (i = 0; i < nf_ct_expect_hsize; i++) {
 		hlist_for_each_entry_safe(exp, n, next,
-					  &nf_ct_expect_hash[i], hnode) {
+					  &init_net.ct.expect_hash[i], hnode) {
 			struct nf_conn_help *help = nfct_help(exp->master);
 			if ((help->helper == me || exp->helper == me) &&
 			    del_timer(&exp->timeout)) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 918a3358a12..cadfd15b44f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1458,6 +1458,7 @@ static int ctnetlink_exp_done(struct netlink_callback *cb)
 static int
 ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = &init_net;
 	struct nf_conntrack_expect *exp, *last;
 	struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh);
 	struct hlist_node *n;
@@ -1467,7 +1468,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	last = (struct nf_conntrack_expect *)cb->args[1];
 	for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
 restart:
-		hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]],
+		hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]],
 				     hnode) {
 			if (l3proto && exp->tuple.src.l3num != l3proto)
 				continue;
@@ -1529,7 +1530,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	exp = nf_ct_expect_find_get(&tuple);
+	exp = nf_ct_expect_find_get(&init_net, &tuple);
 	if (!exp)
 		return -ENOENT;
 
@@ -1583,7 +1584,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 			return err;
 
 		/* bump usage count to 2 */
-		exp = nf_ct_expect_find_get(&tuple);
+		exp = nf_ct_expect_find_get(&init_net, &tuple);
 		if (!exp)
 			return -ENOENT;
 
@@ -1613,7 +1614,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		}
 		for (i = 0; i < nf_ct_expect_hsize; i++) {
 			hlist_for_each_entry_safe(exp, n, next,
-						  &nf_ct_expect_hash[i],
+						  &init_net.ct.expect_hash[i],
 						  hnode) {
 				m_help = nfct_help(exp->master);
 				if (m_help->helper == h
@@ -1629,7 +1630,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		spin_lock_bh(&nf_conntrack_lock);
 		for (i = 0; i < nf_ct_expect_hsize; i++) {
 			hlist_for_each_entry_safe(exp, n, next,
-						  &nf_ct_expect_hash[i],
+						  &init_net.ct.expect_hash[i],
 						  hnode) {
 				if (del_timer(&exp->timeout)) {
 					nf_ct_unlink_expect(exp);
@@ -1724,7 +1725,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
 		return err;
 
 	spin_lock_bh(&nf_conntrack_lock);
-	exp = __nf_ct_expect_find(&tuple);
+	exp = __nf_ct_expect_find(&init_net, &tuple);
 
 	if (!exp) {
 		spin_unlock_bh(&nf_conntrack_lock);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 7caf45b59d2..5db7df5d19b 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -121,7 +121,7 @@ static void pptp_expectfn(struct nf_conn *ct,
 		pr_debug("trying to unexpect other dir: ");
 		nf_ct_dump_tuple(&inv_t);
 
-		exp_other = nf_ct_expect_find_get(&inv_t);
+		exp_other = nf_ct_expect_find_get(&init_net, &inv_t);
 		if (exp_other) {
 			/* delete other expectation.  */
 			pr_debug("found\n");
@@ -154,7 +154,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 		nf_ct_put(sibling);
 		return 1;
 	} else {
-		exp = nf_ct_expect_find_get(t);
+		exp = nf_ct_expect_find_get(&init_net, t);
 		if (exp) {
 			pr_debug("unexpect_related of expect %p\n", exp);
 			nf_ct_unexpect_related(exp);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 1fa306be60f..a006080eb38 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -775,7 +775,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb,
 
 	rcu_read_lock();
 	do {
-		exp = __nf_ct_expect_find(&tuple);
+		exp = __nf_ct_expect_find(&init_net, &tuple);
 
 		if (!exp || exp->master == ct ||
 		    nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
-- 
cgit v1.2.3


From 63c9a26264be108b52de087724673f8664570e34 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:04 +0200
Subject: netfilter: netns nf_conntrack: per-netns unconfirmed list

What is confirmed connection in one netns can very well be unconfirmed
in another one.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c   | 7 ++++---
 net/netfilter/nf_conntrack_helper.c | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c188edea249..2a105db1330 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -54,7 +54,6 @@ struct nf_conn nf_conntrack_untracked __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 
 unsigned int nf_ct_log_invalid __read_mostly;
-HLIST_HEAD(unconfirmed);
 static struct kmem_cache *nf_conntrack_cachep __read_mostly;
 
 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
@@ -596,7 +595,8 @@ init_conntrack(struct net *net,
 	}
 
 	/* Overload tuple linked list to put us in unconfirmed list. */
-	hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed);
+	hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
+		       &net->ct.unconfirmed);
 
 	spin_unlock_bh(&nf_conntrack_lock);
 
@@ -957,7 +957,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 				goto found;
 		}
 	}
-	hlist_for_each_entry(h, n, &unconfirmed, hnode) {
+	hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (iter(ct, data))
 			set_bit(IPS_DYING_BIT, &ct->status);
@@ -1154,6 +1154,7 @@ int nf_conntrack_init(struct net *net)
 		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
 		goto err_out;
 	}
+	INIT_HLIST_HEAD(&net->ct.unconfirmed);
 
 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index c793db810cd..920e778539a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -156,7 +156,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 	}
 
 	/* Get rid of expecteds, set helpers to NULL. */
-	hlist_for_each_entry(h, n, &unconfirmed, hnode)
+	hlist_for_each_entry(h, n, &init_net.ct.unconfirmed, hnode)
 		unhelp(h, me);
 	for (i = 0; i < nf_conntrack_htable_size; i++) {
 		hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode)
-- 
cgit v1.2.3


From a702a65fc1376fc1f6757ec2a6960348af3f1876 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:04 +0200
Subject: netfilter: netns nf_conntrack: pass netns pointer to
 nf_conntrack_in()

It's deducible from skb->dev or skb->dst->dev, but we know netns at
the moment of call, so pass it down and use for finding and creating
conntracks.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  4 ++--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 24 ++++++++++++++++--------
 net/netfilter/nf_conntrack_core.c              | 15 ++++++++-------
 3 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 03dd108015c..2e4dd3fb002 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -172,7 +172,7 @@ static unsigned int ipv4_conntrack_in(unsigned int hooknum,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	return nf_conntrack_in(PF_INET, hooknum, skb);
+	return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
 }
 
 static unsigned int ipv4_conntrack_local(unsigned int hooknum,
@@ -188,7 +188,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
 			printk("ipt_hook: happy cracking.\n");
 		return NF_ACCEPT;
 	}
-	return nf_conntrack_in(PF_INET, hooknum, skb);
+	return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
 }
 
 /* Connection tracking may drop packets, but never alters them, so
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 85050c072ab..e91db16611d 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -211,11 +211,10 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
 	return NF_STOLEN;
 }
 
-static unsigned int ipv6_conntrack_in(unsigned int hooknum,
-				      struct sk_buff *skb,
-				      const struct net_device *in,
-				      const struct net_device *out,
-				      int (*okfn)(struct sk_buff *))
+static unsigned int __ipv6_conntrack_in(struct net *net,
+					unsigned int hooknum,
+					struct sk_buff *skb,
+					int (*okfn)(struct sk_buff *))
 {
 	struct sk_buff *reasm = skb->nfct_reasm;
 
@@ -225,7 +224,7 @@ static unsigned int ipv6_conntrack_in(unsigned int hooknum,
 		if (!reasm->nfct) {
 			unsigned int ret;
 
-			ret = nf_conntrack_in(PF_INET6, hooknum, reasm);
+			ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm);
 			if (ret != NF_ACCEPT)
 				return ret;
 		}
@@ -235,7 +234,16 @@ static unsigned int ipv6_conntrack_in(unsigned int hooknum,
 		return NF_ACCEPT;
 	}
 
-	return nf_conntrack_in(PF_INET6, hooknum, skb);
+	return nf_conntrack_in(net, PF_INET6, hooknum, skb);
+}
+
+static unsigned int ipv6_conntrack_in(unsigned int hooknum,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn);
 }
 
 static unsigned int ipv6_conntrack_local(unsigned int hooknum,
@@ -250,7 +258,7 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
 			printk("ipv6_conntrack_local: packet too short\n");
 		return NF_ACCEPT;
 	}
-	return ipv6_conntrack_in(hooknum, skb, in, out, okfn);
+	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn);
 }
 
 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 2a105db1330..5c96d9732c7 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -611,7 +611,8 @@ init_conntrack(struct net *net,
 
 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
 static inline struct nf_conn *
-resolve_normal_ct(struct sk_buff *skb,
+resolve_normal_ct(struct net *net,
+		  struct sk_buff *skb,
 		  unsigned int dataoff,
 		  u_int16_t l3num,
 		  u_int8_t protonum,
@@ -632,10 +633,9 @@ resolve_normal_ct(struct sk_buff *skb,
 	}
 
 	/* look for tuple match */
-	h = nf_conntrack_find_get(&init_net, &tuple);
+	h = nf_conntrack_find_get(net, &tuple);
 	if (!h) {
-		h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb,
-				   dataoff);
+		h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
 		if (!h)
 			return NULL;
 		if (IS_ERR(h))
@@ -669,7 +669,8 @@ resolve_normal_ct(struct sk_buff *skb,
 }
 
 unsigned int
-nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb)
+nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
+		struct sk_buff *skb)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -709,8 +710,8 @@ nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb)
 		return -ret;
 	}
 
-	ct = resolve_normal_ct(skb, dataoff, pf, protonum, l3proto, l4proto,
-			       &set_reply, &ctinfo);
+	ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
+			       l3proto, l4proto, &set_reply, &ctinfo);
 	if (!ct) {
 		/* Not valid part of a connection */
 		NF_CT_STAT_INC_ATOMIC(invalid);
-- 
cgit v1.2.3


From 74c51a1497033e6ff7b8096797daca233a4a30df Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:05 +0200
Subject: netfilter: netns nf_conntrack: pass netns pointer to L4 protocol's
 ->error hook

Again, it's deducible from skb, but we're going to use it for
nf_conntrack_checksum and statistics, so just pass it from upper layer.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  8 ++++----
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  9 +++++----
 net/netfilter/nf_conntrack_core.c              | 12 +++++++-----
 net/netfilter/nf_conntrack_proto_dccp.c        |  6 +++---
 net/netfilter/nf_conntrack_proto_tcp.c         |  3 ++-
 net/netfilter/nf_conntrack_proto_udp.c         |  2 +-
 net/netfilter/nf_conntrack_proto_udplite.c     |  4 +++-
 7 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index daf346377b6..8c7ed5bc959 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -123,7 +123,7 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
 static int
-icmp_error_message(struct sk_buff *skb,
+icmp_error_message(struct net *net, struct sk_buff *skb,
 		 enum ip_conntrack_info *ctinfo,
 		 unsigned int hooknum)
 {
@@ -155,7 +155,7 @@ icmp_error_message(struct sk_buff *skb,
 
 	*ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(&init_net, &innertuple);
+	h = nf_conntrack_find_get(net, &innertuple);
 	if (!h) {
 		pr_debug("icmp_error_message: no match\n");
 		return -NF_ACCEPT;
@@ -172,7 +172,7 @@ icmp_error_message(struct sk_buff *skb,
 
 /* Small and modified version of icmp_rcv */
 static int
-icmp_error(struct sk_buff *skb, unsigned int dataoff,
+icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	   enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
 {
 	const struct icmphdr *icmph;
@@ -217,7 +217,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
 	    && icmph->type != ICMP_REDIRECT)
 		return NF_ACCEPT;
 
-	return icmp_error_message(skb, ctinfo, hooknum);
+	return icmp_error_message(net, skb, ctinfo, hooknum);
 }
 
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 548cf4f15c0..aabddfe2127 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -122,7 +122,8 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
 }
 
 static int
-icmpv6_error_message(struct sk_buff *skb,
+icmpv6_error_message(struct net *net,
+		     struct sk_buff *skb,
 		     unsigned int icmp6off,
 		     enum ip_conntrack_info *ctinfo,
 		     unsigned int hooknum)
@@ -156,7 +157,7 @@ icmpv6_error_message(struct sk_buff *skb,
 
 	*ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(&init_net, &intuple);
+	h = nf_conntrack_find_get(net, &intuple);
 	if (!h) {
 		pr_debug("icmpv6_error: no match\n");
 		return -NF_ACCEPT;
@@ -172,7 +173,7 @@ icmpv6_error_message(struct sk_buff *skb,
 }
 
 static int
-icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
+icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	     enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
 {
 	const struct icmp6hdr *icmp6h;
@@ -197,7 +198,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
 	if (icmp6h->icmp6_type >= 128)
 		return NF_ACCEPT;
 
-	return icmpv6_error_message(skb, dataoff, ctinfo, hooknum);
+	return icmpv6_error_message(net, skb, dataoff, ctinfo, hooknum);
 }
 
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5c96d9732c7..251f020c7c1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -703,11 +703,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
 	 * core what to do with the packet. */
-	if (l4proto->error != NULL &&
-	    (ret = l4proto->error(skb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
-		NF_CT_STAT_INC_ATOMIC(error);
-		NF_CT_STAT_INC_ATOMIC(invalid);
-		return -ret;
+	if (l4proto->error != NULL) {
+		ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum);
+		if (ret <= 0) {
+			NF_CT_STAT_INC_ATOMIC(error);
+			NF_CT_STAT_INC_ATOMIC(invalid);
+			return -ret;
+		}
 	}
 
 	ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index edc30358dc1..6ead8da3e9e 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -545,9 +545,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-static int dccp_error(struct sk_buff *skb, unsigned int dataoff,
-		      enum ip_conntrack_info *ctinfo, u_int8_t pf,
-		      unsigned int hooknum)
+static int dccp_error(struct net *net, struct sk_buff *skb,
+		      unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		      u_int8_t pf, unsigned int hooknum)
 {
 	struct dccp_hdr _dh, *dh;
 	unsigned int dccp_len = skb->len - dataoff;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 539a8202025..4e71de2405f 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -746,7 +746,8 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
 };
 
 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
-static int tcp_error(struct sk_buff *skb,
+static int tcp_error(struct net *net,
+		     struct sk_buff *skb,
 		     unsigned int dataoff,
 		     enum ip_conntrack_info *ctinfo,
 		     u_int8_t pf,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 2a965c4a0ea..8a245beb2c9 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -89,7 +89,7 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
-static int udp_error(struct sk_buff *skb, unsigned int dataoff,
+static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 		     enum ip_conntrack_info *ctinfo,
 		     u_int8_t pf,
 		     unsigned int hooknum)
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 4fb6c8d83a8..981701919a7 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -89,7 +89,9 @@ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
-static int udplite_error(struct sk_buff *skb, unsigned int dataoff,
+static int udplite_error(struct net *net,
+			 struct sk_buff *skb,
+			 unsigned int dataoff,
 			 enum ip_conntrack_info *ctinfo,
 			 u_int8_t pf,
 			 unsigned int hooknum)
-- 
cgit v1.2.3


From b2ce2c7479d9b60dd268203e56bb738e78fd5fda Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:05 +0200
Subject: netfilter: netns nf_conntrack: per-netns /proc/net/nf_conntrack,
 /proc/net/stat/nf_conntrack

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_standalone.c | 51 ++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5456e4b9424..02eaf872277 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -40,18 +40,20 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 EXPORT_SYMBOL_GPL(print_tuple);
 
 struct ct_iter_state {
+	struct seq_net_private p;
 	unsigned int bucket;
 };
 
 static struct hlist_node *ct_get_first(struct seq_file *seq)
 {
+	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 	struct hlist_node *n;
 
 	for (st->bucket = 0;
 	     st->bucket < nf_conntrack_htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(init_net.ct.hash[st->bucket].first);
+		n = rcu_dereference(net->ct.hash[st->bucket].first);
 		if (n)
 			return n;
 	}
@@ -61,13 +63,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
 static struct hlist_node *ct_get_next(struct seq_file *seq,
 				      struct hlist_node *head)
 {
+	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 
 	head = rcu_dereference(head->next);
 	while (head == NULL) {
 		if (++st->bucket >= nf_conntrack_htable_size)
 			return NULL;
-		head = rcu_dereference(init_net.ct.hash[st->bucket].first);
+		head = rcu_dereference(net->ct.hash[st->bucket].first);
 	}
 	return head;
 }
@@ -177,7 +180,7 @@ static const struct seq_operations ct_seq_ops = {
 
 static int ct_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &ct_seq_ops,
+	return seq_open_net(inode, file, &ct_seq_ops,
 			sizeof(struct ct_iter_state));
 }
 
@@ -186,7 +189,7 @@ static const struct file_operations ct_file_ops = {
 	.open    = ct_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 
 static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
@@ -277,38 +280,38 @@ static const struct file_operations ct_cpu_seq_fops = {
 	.release = seq_release,
 };
 
-static int nf_conntrack_standalone_init_proc(void)
+static int nf_conntrack_standalone_init_proc(struct net *net)
 {
 	struct proc_dir_entry *pde;
 
-	pde = proc_net_fops_create(&init_net, "nf_conntrack", 0440, &ct_file_ops);
+	pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops);
 	if (!pde)
 		goto out_nf_conntrack;
 
-	pde = proc_create("nf_conntrack", S_IRUGO, init_net.proc_net_stat,
+	pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
 			  &ct_cpu_seq_fops);
 	if (!pde)
 		goto out_stat_nf_conntrack;
 	return 0;
 
 out_stat_nf_conntrack:
-	proc_net_remove(&init_net, "nf_conntrack");
+	proc_net_remove(net, "nf_conntrack");
 out_nf_conntrack:
 	return -ENOMEM;
 }
 
-static void nf_conntrack_standalone_fini_proc(void)
+static void nf_conntrack_standalone_fini_proc(struct net *net)
 {
-	remove_proc_entry("nf_conntrack", init_net.proc_net_stat);
-	proc_net_remove(&init_net, "nf_conntrack");
+	remove_proc_entry("nf_conntrack", net->proc_net_stat);
+	proc_net_remove(net, "nf_conntrack");
 }
 #else
-static int nf_conntrack_standalone_init_proc(void)
+static int nf_conntrack_standalone_init_proc(struct net *net)
 {
 	return 0;
 }
 
-static void nf_conntrack_standalone_fini_proc(void)
+static void nf_conntrack_standalone_fini_proc(struct net *net)
 {
 }
 #endif /* CONFIG_PROC_FS */
@@ -442,11 +445,25 @@ static void nf_conntrack_standalone_fini_sysctl(void)
 
 static int nf_conntrack_net_init(struct net *net)
 {
-	return nf_conntrack_init(net);
+	int ret;
+
+	ret = nf_conntrack_init(net);
+	if (ret < 0)
+		goto out_init;
+	ret = nf_conntrack_standalone_init_proc(net);
+	if (ret < 0)
+		goto out_proc;
+	return 0;
+
+out_proc:
+	nf_conntrack_cleanup(net);
+out_init:
+	return ret;
 }
 
 static void nf_conntrack_net_exit(struct net *net)
 {
+	nf_conntrack_standalone_fini_proc(net);
 	nf_conntrack_cleanup(net);
 }
 
@@ -462,17 +479,12 @@ static int __init nf_conntrack_standalone_init(void)
 	ret = register_pernet_subsys(&nf_conntrack_net_ops);
 	if (ret < 0)
 		goto out;
-	ret = nf_conntrack_standalone_init_proc();
-	if (ret < 0)
-		goto out_proc;
 	ret = nf_conntrack_standalone_init_sysctl();
 	if (ret < 0)
 		goto out_sysctl;
 	return 0;
 
 out_sysctl:
-	nf_conntrack_standalone_fini_proc();
-out_proc:
 	unregister_pernet_subsys(&nf_conntrack_net_ops);
 out:
 	return ret;
@@ -481,7 +493,6 @@ out:
 static void __exit nf_conntrack_standalone_fini(void)
 {
 	nf_conntrack_standalone_fini_sysctl();
-	nf_conntrack_standalone_fini_proc();
 	unregister_pernet_subsys(&nf_conntrack_net_ops);
 }
 
-- 
cgit v1.2.3


From dc5129f8df7cc3f2f04b322728d71c42795d34cc Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:06 +0200
Subject: netfilter: netns nf_conntrack: per-netns
 /proc/net/nf_conntrack_expect

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_expect.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 5307316356e..6a09200049e 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -423,12 +423,13 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_related);
 
 #ifdef CONFIG_PROC_FS
 struct ct_expect_iter_state {
+	struct seq_net_private p;
 	unsigned int bucket;
 };
 
 static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 {
-	struct net *net = &init_net;
+	struct net *net = seq_file_net(seq);
 	struct ct_expect_iter_state *st = seq->private;
 	struct hlist_node *n;
 
@@ -443,7 +444,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
 					     struct hlist_node *head)
 {
-	struct net *net = &init_net;
+	struct net *net = seq_file_net(seq);
 	struct ct_expect_iter_state *st = seq->private;
 
 	head = rcu_dereference(head->next);
@@ -524,7 +525,7 @@ static const struct seq_operations exp_seq_ops = {
 
 static int exp_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &exp_seq_ops,
+	return seq_open_net(inode, file, &exp_seq_ops,
 			sizeof(struct ct_expect_iter_state));
 }
 
@@ -533,26 +534,26 @@ static const struct file_operations exp_file_ops = {
 	.open    = exp_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 #endif /* CONFIG_PROC_FS */
 
-static int exp_proc_init(void)
+static int exp_proc_init(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *proc;
 
-	proc = proc_net_fops_create(&init_net, "nf_conntrack_expect", 0440, &exp_file_ops);
+	proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops);
 	if (!proc)
 		return -ENOMEM;
 #endif /* CONFIG_PROC_FS */
 	return 0;
 }
 
-static void exp_proc_remove(void)
+static void exp_proc_remove(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-	proc_net_remove(&init_net, "nf_conntrack_expect");
+	proc_net_remove(net, "nf_conntrack_expect");
 #endif /* CONFIG_PROC_FS */
 }
 
@@ -581,7 +582,7 @@ int nf_conntrack_expect_init(struct net *net)
 	if (!nf_ct_expect_cachep)
 		goto err2;
 
-	err = exp_proc_init();
+	err = exp_proc_init(net);
 	if (err < 0)
 		goto err3;
 
@@ -598,7 +599,7 @@ err1:
 
 void nf_conntrack_expect_fini(struct net *net)
 {
-	exp_proc_remove();
+	exp_proc_remove(net);
 	kmem_cache_destroy(nf_ct_expect_cachep);
 	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
 			     nf_ct_expect_hsize);
-- 
cgit v1.2.3


From 5e6b29972b7e9c9c39882227e36fe0cd3463fe96 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:06 +0200
Subject: netfilter: netns nf_conntrack: per-netns /proc/net/ip_conntrack,
 /proc/net/stat/ip_conntrack, /proc/net/ip_conntrack_expect

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   | 57 ++++++++++++++--------
 1 file changed, 38 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index f8636a57e8c..b2940836d10 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -21,18 +21,20 @@
 #include <net/netfilter/nf_conntrack_acct.h>
 
 struct ct_iter_state {
+	struct seq_net_private p;
 	unsigned int bucket;
 };
 
 static struct hlist_node *ct_get_first(struct seq_file *seq)
 {
+	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 	struct hlist_node *n;
 
 	for (st->bucket = 0;
 	     st->bucket < nf_conntrack_htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(init_net.ct.hash[st->bucket].first);
+		n = rcu_dereference(net->ct.hash[st->bucket].first);
 		if (n)
 			return n;
 	}
@@ -42,13 +44,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
 static struct hlist_node *ct_get_next(struct seq_file *seq,
 				      struct hlist_node *head)
 {
+	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 
 	head = rcu_dereference(head->next);
 	while (head == NULL) {
 		if (++st->bucket >= nf_conntrack_htable_size)
 			return NULL;
-		head = rcu_dereference(init_net.ct.hash[st->bucket].first);
+		head = rcu_dereference(net->ct.hash[st->bucket].first);
 	}
 	return head;
 }
@@ -158,8 +161,8 @@ static const struct seq_operations ct_seq_ops = {
 
 static int ct_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &ct_seq_ops,
-			sizeof(struct ct_iter_state));
+	return seq_open_net(inode, file, &ct_seq_ops,
+			    sizeof(struct ct_iter_state));
 }
 
 static const struct file_operations ct_file_ops = {
@@ -167,17 +170,18 @@ static const struct file_operations ct_file_ops = {
 	.open    = ct_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 
 /* expects */
 struct ct_expect_iter_state {
+	struct seq_net_private p;
 	unsigned int bucket;
 };
 
 static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 {
-	struct net *net = &init_net;
+	struct net *net = seq_file_net(seq);
 	struct ct_expect_iter_state *st = seq->private;
 	struct hlist_node *n;
 
@@ -192,7 +196,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
 					     struct hlist_node *head)
 {
-	struct net *net = &init_net;
+	struct net *net = seq_file_net(seq);
 	struct ct_expect_iter_state *st = seq->private;
 
 	head = rcu_dereference(head->next);
@@ -267,8 +271,8 @@ static const struct seq_operations exp_seq_ops = {
 
 static int exp_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &exp_seq_ops,
-			sizeof(struct ct_expect_iter_state));
+	return seq_open_net(inode, file, &exp_seq_ops,
+			    sizeof(struct ct_expect_iter_state));
 }
 
 static const struct file_operations ip_exp_file_ops = {
@@ -276,7 +280,7 @@ static const struct file_operations ip_exp_file_ops = {
 	.open    = exp_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 
 static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
@@ -367,36 +371,51 @@ static const struct file_operations ct_cpu_seq_fops = {
 	.release = seq_release,
 };
 
-int __init nf_conntrack_ipv4_compat_init(void)
+static int __net_init ip_conntrack_net_init(struct net *net)
 {
 	struct proc_dir_entry *proc, *proc_exp, *proc_stat;
 
-	proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops);
+	proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops);
 	if (!proc)
 		goto err1;
 
-	proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440,
+	proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440,
 					&ip_exp_file_ops);
 	if (!proc_exp)
 		goto err2;
 
 	proc_stat = proc_create("ip_conntrack", S_IRUGO,
-				init_net.proc_net_stat, &ct_cpu_seq_fops);
+				net->proc_net_stat, &ct_cpu_seq_fops);
 	if (!proc_stat)
 		goto err3;
 	return 0;
 
 err3:
-	proc_net_remove(&init_net, "ip_conntrack_expect");
+	proc_net_remove(net, "ip_conntrack_expect");
 err2:
-	proc_net_remove(&init_net, "ip_conntrack");
+	proc_net_remove(net, "ip_conntrack");
 err1:
 	return -ENOMEM;
 }
 
+static void __net_exit ip_conntrack_net_exit(struct net *net)
+{
+	remove_proc_entry("ip_conntrack", net->proc_net_stat);
+	proc_net_remove(net, "ip_conntrack_expect");
+	proc_net_remove(net, "ip_conntrack");
+}
+
+static struct pernet_operations ip_conntrack_net_ops = {
+	.init = ip_conntrack_net_init,
+	.exit = ip_conntrack_net_exit,
+};
+
+int __init nf_conntrack_ipv4_compat_init(void)
+{
+	return register_pernet_subsys(&ip_conntrack_net_ops);
+}
+
 void __exit nf_conntrack_ipv4_compat_fini(void)
 {
-	remove_proc_entry("ip_conntrack", init_net.proc_net_stat);
-	proc_net_remove(&init_net, "ip_conntrack_expect");
-	proc_net_remove(&init_net, "ip_conntrack");
+	unregister_pernet_subsys(&ip_conntrack_net_ops);
 }
-- 
cgit v1.2.3


From b76a461f11eb5f32d37a9c8eae7b2f3b3f261b43 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:06 +0200
Subject: netns: export netns list

Conntrack code will use it for
a) removing expectations and helpers when corresponding module is removed, and
b) removing conntracks when L3 protocol conntrack module is removed.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/core/net_namespace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7c52fe277b6..b0dc818a91d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -18,6 +18,7 @@ static struct list_head *first_device = &pernet_list;
 static DEFINE_MUTEX(net_mutex);
 
 LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
 
 struct net init_net;
 EXPORT_SYMBOL(init_net);
-- 
cgit v1.2.3


From 68047937677f2dffb5c47b57ce8baba5714b2142 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:06 +0200
Subject: netfilter: netns nf_conntrack: unregister helper in every netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_helper.c | 40 ++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 920e778539a..9c06b9f86ad 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -123,29 +123,18 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_register);
 
-void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
+					     struct net *net)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_expect *exp;
 	const struct hlist_node *n, *next;
 	unsigned int i;
 
-	mutex_lock(&nf_ct_helper_mutex);
-	hlist_del_rcu(&me->hnode);
-	nf_ct_helper_count--;
-	mutex_unlock(&nf_ct_helper_mutex);
-
-	/* Make sure every nothing is still using the helper unless its a
-	 * connection in the hash.
-	 */
-	synchronize_rcu();
-
-	spin_lock_bh(&nf_conntrack_lock);
-
 	/* Get rid of expectations */
 	for (i = 0; i < nf_ct_expect_hsize; i++) {
 		hlist_for_each_entry_safe(exp, n, next,
-					  &init_net.ct.expect_hash[i], hnode) {
+					  &net->ct.expect_hash[i], hnode) {
 			struct nf_conn_help *help = nfct_help(exp->master);
 			if ((help->helper == me || exp->helper == me) &&
 			    del_timer(&exp->timeout)) {
@@ -156,12 +145,31 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 	}
 
 	/* Get rid of expecteds, set helpers to NULL. */
-	hlist_for_each_entry(h, n, &init_net.ct.unconfirmed, hnode)
+	hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode)
 		unhelp(h, me);
 	for (i = 0; i < nf_conntrack_htable_size; i++) {
-		hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode)
+		hlist_for_each_entry(h, n, &net->ct.hash[i], hnode)
 			unhelp(h, me);
 	}
+}
+
+void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+{
+	struct net *net;
+
+	mutex_lock(&nf_ct_helper_mutex);
+	hlist_del_rcu(&me->hnode);
+	nf_ct_helper_count--;
+	mutex_unlock(&nf_ct_helper_mutex);
+
+	/* Make sure every nothing is still using the helper unless its a
+	 * connection in the hash.
+	 */
+	synchronize_rcu();
+
+	spin_lock_bh(&nf_conntrack_lock);
+	for_each_net(net)
+		__nf_conntrack_helper_unregister(me, net);
 	spin_unlock_bh(&nf_conntrack_lock);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
-- 
cgit v1.2.3


From 678d66753091a4102910392fb6198a6c6ce7f510 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:07 +0200
Subject: netfilter: netns nf_conntrack: cleanup after L3 and L4 proto
 unregister in every netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_proto.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 3a2f7ef997f..a59a307e685 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -207,6 +207,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register);
 
 void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
 {
+	struct net *net;
+
 	BUG_ON(proto->l3proto >= AF_MAX);
 
 	mutex_lock(&nf_ct_proto_mutex);
@@ -219,7 +221,8 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
 	synchronize_rcu();
 
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_cleanup(&init_net, kill_l3proto, proto);
+	for_each_net(net)
+		nf_ct_iterate_cleanup(net, kill_l3proto, proto);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
 
@@ -316,6 +319,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
 
 void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
 {
+	struct net *net;
+
 	BUG_ON(l4proto->l3proto >= PF_MAX);
 
 	mutex_lock(&nf_ct_proto_mutex);
@@ -328,7 +333,8 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
 	synchronize_rcu();
 
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_cleanup(&init_net, kill_l4proto, l4proto);
+	for_each_net(net)
+		nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
 
-- 
cgit v1.2.3


From a71996fccce4b2086a26036aa3c915365ca36926 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:07 +0200
Subject: netfilter: netns nf_conntrack: pass conntrack to
 nf_conntrack_event_cache() not skb

This is cleaner, we already know conntrack to which event is relevant.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv4/netfilter/nf_nat_helper.c             |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_core.c              | 10 +++++-----
 net/netfilter/nf_conntrack_ftp.c               |  9 +++++----
 net/netfilter/nf_conntrack_proto_gre.c         |  2 +-
 net/netfilter/nf_conntrack_proto_sctp.c        |  4 ++--
 net/netfilter/nf_conntrack_proto_tcp.c         |  6 +++---
 net/netfilter/nf_conntrack_proto_udp.c         |  2 +-
 net/netfilter/nf_conntrack_proto_udplite.c     |  2 +-
 net/netfilter/xt_CONNMARK.c                    |  8 ++++----
 net/netfilter/xt_CONNSECMARK.c                 |  2 +-
 12 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 8c7ed5bc959..205ba399d4a 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -91,7 +91,7 @@ static int icmp_packet(struct nf_conn *ct,
 			nf_ct_kill_acct(ct, ctinfo, skb);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
-		nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+		nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
 		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
 	}
 
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 112dcfa1290..cf7a42bf982 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -193,7 +193,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 		nf_conntrack_tcp_update(skb, ip_hdrlen(skb),
 					ct, CTINFO2DIR(ctinfo));
 
-		nf_conntrack_event_cache(IPCT_NATSEQADJ, skb);
+		nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
 	}
 	return 1;
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index aabddfe2127..df04de91e6e 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -93,7 +93,7 @@ static int icmpv6_packet(struct nf_conn *ct,
 			nf_ct_kill_acct(ct, ctinfo, skb);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
-		nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+		nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
 		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout);
 	}
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 251f020c7c1..01f59c57730 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -370,14 +370,14 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	spin_unlock_bh(&nf_conntrack_lock);
 	help = nfct_help(ct);
 	if (help && help->helper)
-		nf_conntrack_event_cache(IPCT_HELPER, skb);
+		nf_conntrack_event_cache(IPCT_HELPER, ct);
 #ifdef CONFIG_NF_NAT_NEEDED
 	if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
 	    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
-		nf_conntrack_event_cache(IPCT_NATINFO, skb);
+		nf_conntrack_event_cache(IPCT_NATINFO, ct);
 #endif
 	nf_conntrack_event_cache(master_ct(ct) ?
-				 IPCT_RELATED : IPCT_NEW, skb);
+				 IPCT_RELATED : IPCT_NEW, ct);
 	return NF_ACCEPT;
 
 out:
@@ -740,7 +740,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	}
 
 	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
-		nf_conntrack_event_cache(IPCT_STATUS, skb);
+		nf_conntrack_event_cache(IPCT_STATUS, ct);
 
 	return ret;
 }
@@ -853,7 +853,7 @@ acct:
 
 	/* must be unlocked when calling event cache */
 	if (event)
-		nf_conntrack_event_cache(event, skb);
+		nf_conntrack_event_cache(event, ct);
 }
 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
 
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index bb20672fe03..4f7107107e9 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -318,7 +318,8 @@ static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir)
 }
 
 /* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir,
+static void update_nl_seq(struct nf_conn *ct, u32 nl_seq,
+			  struct nf_ct_ftp_master *info, int dir,
 			  struct sk_buff *skb)
 {
 	unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
@@ -336,11 +337,11 @@ static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir,
 
 	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
-		nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+		nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, ct);
 	} else if (oldest != NUM_SEQ_TO_REMEMBER &&
 		   after(nl_seq, info->seq_aft_nl[dir][oldest])) {
 		info->seq_aft_nl[dir][oldest] = nl_seq;
-		nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+		nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, ct);
 	}
 }
 
@@ -509,7 +510,7 @@ out_update_nl:
 	/* Now if this ends in \n, update ftp info.  Seq may have been
 	 * adjusted by NAT code. */
 	if (ends_in_nl)
-		update_nl_seq(seq, ct_ftp_info, dir, skb);
+		update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
  out:
 	spin_unlock_bh(&nf_ftp_lock);
 	return ret;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index c5a78220fa3..5b1273a01fe 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -229,7 +229,7 @@ static int gre_packet(struct nf_conn *ct,
 				   ct->proto.gre.stream_timeout);
 		/* Also, more likely to be important, and not a probe. */
 		set_bit(IPS_ASSURED_BIT, &ct->status);
-		nf_conntrack_event_cache(IPCT_STATUS, skb);
+		nf_conntrack_event_cache(IPCT_STATUS, ct);
 	} else
 		nf_ct_refresh_acct(ct, ctinfo, skb,
 				   ct->proto.gre.timeout);
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index b5a90596d3f..ae8c2609e23 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -369,7 +369,7 @@ static int sctp_packet(struct nf_conn *ct,
 
 		ct->proto.sctp.state = new_state;
 		if (old_state != new_state)
-			nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
+			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 	}
 	write_unlock_bh(&sctp_lock);
 
@@ -380,7 +380,7 @@ static int sctp_packet(struct nf_conn *ct,
 	    new_state == SCTP_CONNTRACK_ESTABLISHED) {
 		pr_debug("Setting assured bit\n");
 		set_bit(IPS_ASSURED_BIT, &ct->status);
-		nf_conntrack_event_cache(IPCT_STATUS, skb);
+		nf_conntrack_event_cache(IPCT_STATUS, ct);
 	}
 
 	return NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 4e71de2405f..b5d62d66e02 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -969,9 +969,9 @@ static int tcp_packet(struct nf_conn *ct,
 		timeout = tcp_timeouts[new_state];
 	write_unlock_bh(&tcp_lock);
 
-	nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+	nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
 	if (new_state != old_state)
-		nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 
 	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
 		/* If only reply is a RST, we can consider ourselves not to
@@ -990,7 +990,7 @@ static int tcp_packet(struct nf_conn *ct,
 		   after SYN_RECV or a valid answer for a picked up
 		   connection. */
 		set_bit(IPS_ASSURED_BIT, &ct->status);
-		nf_conntrack_event_cache(IPCT_STATUS, skb);
+		nf_conntrack_event_cache(IPCT_STATUS, ct);
 	}
 	nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
 
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 8a245beb2c9..e0ee89e179c 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -75,7 +75,7 @@ static int udp_packet(struct nf_conn *ct,
 		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
 		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
-			nf_conntrack_event_cache(IPCT_STATUS, skb);
+			nf_conntrack_event_cache(IPCT_STATUS, ct);
 	} else
 		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout);
 
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 981701919a7..c5b77c8f86c 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -75,7 +75,7 @@ static int udplite_packet(struct nf_conn *ct,
 				   nf_ct_udplite_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
 		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
-			nf_conntrack_event_cache(IPCT_STATUS, skb);
+			nf_conntrack_event_cache(IPCT_STATUS, ct);
 	} else
 		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout);
 
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index e72e5d01752..e1415c3f5c9 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -54,7 +54,7 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in,
 			newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
 			if (newmark != ct->mark) {
 				ct->mark = newmark;
-				nf_conntrack_event_cache(IPCT_MARK, skb);
+				nf_conntrack_event_cache(IPCT_MARK, ct);
 			}
 			break;
 		case XT_CONNMARK_SAVE:
@@ -62,7 +62,7 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in,
 				  (skb->mark & markinfo->mask);
 			if (ct->mark != newmark) {
 				ct->mark = newmark;
-				nf_conntrack_event_cache(IPCT_MARK, skb);
+				nf_conntrack_event_cache(IPCT_MARK, ct);
 			}
 			break;
 		case XT_CONNMARK_RESTORE:
@@ -95,7 +95,7 @@ connmark_tg(struct sk_buff *skb, const struct net_device *in,
 		newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
 		if (ct->mark != newmark) {
 			ct->mark = newmark;
-			nf_conntrack_event_cache(IPCT_MARK, skb);
+			nf_conntrack_event_cache(IPCT_MARK, ct);
 		}
 		break;
 	case XT_CONNMARK_SAVE:
@@ -103,7 +103,7 @@ connmark_tg(struct sk_buff *skb, const struct net_device *in,
 		          (skb->mark & info->nfmask);
 		if (ct->mark != newmark) {
 			ct->mark = newmark;
-			nf_conntrack_event_cache(IPCT_MARK, skb);
+			nf_conntrack_event_cache(IPCT_MARK, ct);
 		}
 		break;
 	case XT_CONNMARK_RESTORE:
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index ae939e54dfa..5f221c3bd35 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -43,7 +43,7 @@ static void secmark_save(const struct sk_buff *skb)
 		ct = nf_ct_get(skb, &ctinfo);
 		if (ct && !ct->secmark) {
 			ct->secmark = skb->secmark;
-			nf_conntrack_event_cache(IPCT_SECMARK, skb);
+			nf_conntrack_event_cache(IPCT_SECMARK, ct);
 		}
 	}
 }
-- 
cgit v1.2.3


From 6058fa6bb96a5b6145cba10c5171f09c2783ca69 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:07 +0200
Subject: netfilter: netns nf_conntrack: per-netns event cache

Heh, last minute proof-reading of this patch made me think,
that this is actually unneeded, simply because "ct" pointers will be
different for different conntracks in different netns, just like they
are different in one netns.

Not so sure anymore.

[Patrick: pointers will be different, flushing can only be done while
 inactive though and thus it needs to be per netns]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c   | 12 +++++++++---
 net/netfilter/nf_conntrack_ecache.c | 26 +++++++++++++++++++-------
 2 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 01f59c57730..b55944e5e4e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1023,7 +1023,8 @@ void nf_conntrack_cleanup(struct net *net)
 	   delete... */
 	synchronize_net();
 
-	nf_ct_event_cache_flush();
+	nf_ct_event_cache_flush(net);
+	nf_conntrack_ecache_fini(net);
  i_see_dead_people:
 	nf_conntrack_flush(net);
 	if (atomic_read(&net->ct.count) != 0) {
@@ -1151,11 +1152,14 @@ int nf_conntrack_init(struct net *net)
 		max_factor = 4;
 	}
 	atomic_set(&net->ct.count, 0);
+	ret = nf_conntrack_ecache_init(net);
+	if (ret < 0)
+		goto err_ecache;
 	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
 						  &net->ct.hash_vmalloc);
 	if (!net->ct.hash) {
 		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
-		goto err_out;
+		goto err_hash;
 	}
 	INIT_HLIST_HEAD(&net->ct.unconfirmed);
 
@@ -1215,6 +1219,8 @@ err_free_conntrack_slab:
 err_free_hash:
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
-err_out:
+err_hash:
+	nf_conntrack_ecache_fini(net);
+err_ecache:
 	return -ENOMEM;
 }
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 83c41ac3505..a5f5e2e65d1 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -29,9 +29,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_chain);
 ATOMIC_NOTIFIER_HEAD(nf_ct_expect_chain);
 EXPORT_SYMBOL_GPL(nf_ct_expect_chain);
 
-DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
-EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache);
-
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
 static inline void
@@ -51,10 +48,11 @@ __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
  * by code prior to async packet handling for freeing the skb */
 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *ecache;
 
 	local_bh_disable();
-	ecache = &__get_cpu_var(nf_conntrack_ecache);
+	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
 	if (ecache->ct == ct)
 		__nf_ct_deliver_cached_events(ecache);
 	local_bh_enable();
@@ -64,10 +62,11 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 /* Deliver cached events for old pending events, if current conntrack != old */
 void __nf_ct_event_cache_init(struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *ecache;
 
 	/* take care of delivering potentially old events */
-	ecache = &__get_cpu_var(nf_conntrack_ecache);
+	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
 	BUG_ON(ecache->ct == ct);
 	if (ecache->ct)
 		__nf_ct_deliver_cached_events(ecache);
@@ -79,18 +78,31 @@ EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
 
 /* flush the event cache - touches other CPU's data and must not be called
  * while packets are still passing through the code */
-void nf_ct_event_cache_flush(void)
+void nf_ct_event_cache_flush(struct net *net)
 {
 	struct nf_conntrack_ecache *ecache;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		ecache = &per_cpu(nf_conntrack_ecache, cpu);
+		ecache = per_cpu_ptr(net->ct.ecache, cpu);
 		if (ecache->ct)
 			nf_ct_put(ecache->ct);
 	}
 }
 
+int nf_conntrack_ecache_init(struct net *net)
+{
+	net->ct.ecache = alloc_percpu(struct nf_conntrack_ecache);
+	if (!net->ct.ecache)
+		return -ENOMEM;
+	return 0;
+}
+
+void nf_conntrack_ecache_fini(struct net *net)
+{
+	free_percpu(net->ct.ecache);
+}
+
 int nf_conntrack_register_notifier(struct notifier_block *nb)
 {
 	return atomic_notifier_chain_register(&nf_conntrack_chain, nb);
-- 
cgit v1.2.3


From 0d55af8791bfb42e04cc456b348910582f230343 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:07 +0200
Subject: netfilter: netns nf_conntrack: per-netns statistics

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   |  4 +-
 net/netfilter/nf_conntrack_core.c                  | 49 ++++++++++++----------
 net/netfilter/nf_conntrack_expect.c                |  4 +-
 net/netfilter/nf_conntrack_standalone.c            |  4 +-
 4 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index b2940836d10..fdc85b37078 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -294,7 +294,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
-		return &per_cpu(nf_conntrack_stat, cpu);
+		return per_cpu_ptr(init_net.ct.stat, cpu);
 	}
 
 	return NULL;
@@ -308,7 +308,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
-		return &per_cpu(nf_conntrack_stat, cpu);
+		return per_cpu_ptr(init_net.ct.stat, cpu);
 	}
 
 	return NULL;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index b55944e5e4e..1e87fa0cd3a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -56,9 +56,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 unsigned int nf_ct_log_invalid __read_mostly;
 static struct kmem_cache *nf_conntrack_cachep __read_mostly;
 
-DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
-EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
-
 static int nf_conntrack_hash_rnd_initted;
 static unsigned int nf_conntrack_hash_rnd;
 
@@ -171,6 +168,7 @@ static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
 	struct nf_conn *ct = (struct nf_conn *)nfct;
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_l4proto *l4proto;
 
 	pr_debug("destroy_conntrack(%p)\n", ct);
@@ -203,7 +201,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 		hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 	}
 
-	NF_CT_STAT_INC(delete);
+	NF_CT_STAT_INC(net, delete);
 	spin_unlock_bh(&nf_conntrack_lock);
 
 	if (ct->master)
@@ -216,6 +214,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void death_by_timeout(unsigned long ul_conntrack)
 {
 	struct nf_conn *ct = (void *)ul_conntrack;
+	struct net *net = nf_ct_net(ct);
 	struct nf_conn_help *help = nfct_help(ct);
 	struct nf_conntrack_helper *helper;
 
@@ -230,7 +229,7 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	spin_lock_bh(&nf_conntrack_lock);
 	/* Inside lock so preempt is disabled on module removal path.
 	 * Otherwise we can get spurious warnings. */
-	NF_CT_STAT_INC(delete_list);
+	NF_CT_STAT_INC(net, delete_list);
 	clean_from_lists(ct);
 	spin_unlock_bh(&nf_conntrack_lock);
 	nf_ct_put(ct);
@@ -249,11 +248,11 @@ __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
 	local_bh_disable();
 	hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) {
 		if (nf_ct_tuple_equal(tuple, &h->tuple)) {
-			NF_CT_STAT_INC(found);
+			NF_CT_STAT_INC(net, found);
 			local_bh_enable();
 			return h;
 		}
-		NF_CT_STAT_INC(searched);
+		NF_CT_STAT_INC(net, searched);
 	}
 	local_bh_enable();
 
@@ -366,7 +365,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	add_timer(&ct->timeout);
 	atomic_inc(&ct->ct_general.use);
 	set_bit(IPS_CONFIRMED_BIT, &ct->status);
-	NF_CT_STAT_INC(insert);
+	NF_CT_STAT_INC(net, insert);
 	spin_unlock_bh(&nf_conntrack_lock);
 	help = nfct_help(ct);
 	if (help && help->helper)
@@ -381,7 +380,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	return NF_ACCEPT;
 
 out:
-	NF_CT_STAT_INC(insert_failed);
+	NF_CT_STAT_INC(net, insert_failed);
 	spin_unlock_bh(&nf_conntrack_lock);
 	return NF_DROP;
 }
@@ -405,11 +404,11 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 	hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) {
 		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
 		    nf_ct_tuple_equal(tuple, &h->tuple)) {
-			NF_CT_STAT_INC(found);
+			NF_CT_STAT_INC(net, found);
 			rcu_read_unlock_bh();
 			return 1;
 		}
-		NF_CT_STAT_INC(searched);
+		NF_CT_STAT_INC(net, searched);
 	}
 	rcu_read_unlock_bh();
 
@@ -454,7 +453,7 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 	if (del_timer(&ct->timeout)) {
 		death_by_timeout((unsigned long)ct);
 		dropped = 1;
-		NF_CT_STAT_INC_ATOMIC(early_drop);
+		NF_CT_STAT_INC_ATOMIC(net, early_drop);
 	}
 	nf_ct_put(ct);
 	return dropped;
@@ -581,7 +580,7 @@ init_conntrack(struct net *net,
 		ct->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&ct->master->ct_general);
-		NF_CT_STAT_INC(expect_new);
+		NF_CT_STAT_INC(net, expect_new);
 	} else {
 		struct nf_conntrack_helper *helper;
 
@@ -591,7 +590,7 @@ init_conntrack(struct net *net,
 			if (help)
 				rcu_assign_pointer(help->helper, helper);
 		}
-		NF_CT_STAT_INC(new);
+		NF_CT_STAT_INC(net, new);
 	}
 
 	/* Overload tuple linked list to put us in unconfirmed list. */
@@ -683,7 +682,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 	/* Previously seen (loopback or untracked)?  Ignore. */
 	if (skb->nfct) {
-		NF_CT_STAT_INC_ATOMIC(ignore);
+		NF_CT_STAT_INC_ATOMIC(net, ignore);
 		return NF_ACCEPT;
 	}
 
@@ -693,8 +692,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 				   &dataoff, &protonum);
 	if (ret <= 0) {
 		pr_debug("not prepared to track yet or error occured\n");
-		NF_CT_STAT_INC_ATOMIC(error);
-		NF_CT_STAT_INC_ATOMIC(invalid);
+		NF_CT_STAT_INC_ATOMIC(net, error);
+		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		return -ret;
 	}
 
@@ -706,8 +705,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	if (l4proto->error != NULL) {
 		ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum);
 		if (ret <= 0) {
-			NF_CT_STAT_INC_ATOMIC(error);
-			NF_CT_STAT_INC_ATOMIC(invalid);
+			NF_CT_STAT_INC_ATOMIC(net, error);
+			NF_CT_STAT_INC_ATOMIC(net, invalid);
 			return -ret;
 		}
 	}
@@ -716,13 +715,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			       l3proto, l4proto, &set_reply, &ctinfo);
 	if (!ct) {
 		/* Not valid part of a connection */
-		NF_CT_STAT_INC_ATOMIC(invalid);
+		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		return NF_ACCEPT;
 	}
 
 	if (IS_ERR(ct)) {
 		/* Too stressed to deal. */
-		NF_CT_STAT_INC_ATOMIC(drop);
+		NF_CT_STAT_INC_ATOMIC(net, drop);
 		return NF_DROP;
 	}
 
@@ -735,7 +734,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
 		nf_conntrack_put(skb->nfct);
 		skb->nfct = NULL;
-		NF_CT_STAT_INC_ATOMIC(invalid);
+		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		return -ret;
 	}
 
@@ -1043,6 +1042,7 @@ void nf_conntrack_cleanup(struct net *net)
 
 	nf_conntrack_acct_fini();
 	nf_conntrack_expect_fini(net);
+	free_percpu(net->ct.stat);
 	nf_conntrack_helper_fini();
 	nf_conntrack_proto_fini();
 }
@@ -1152,6 +1152,9 @@ int nf_conntrack_init(struct net *net)
 		max_factor = 4;
 	}
 	atomic_set(&net->ct.count, 0);
+	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+	if (!net->ct.stat)
+		goto err_stat;
 	ret = nf_conntrack_ecache_init(net);
 	if (ret < 0)
 		goto err_ecache;
@@ -1222,5 +1225,7 @@ err_free_hash:
 err_hash:
 	nf_conntrack_ecache_fini(net);
 err_ecache:
+	free_percpu(net->ct.stat);
+err_stat:
 	return -ENOMEM;
 }
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 6a09200049e..b7f75117161 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -53,7 +53,7 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 	master_help->expecting[exp->class]--;
 	nf_ct_expect_put(exp);
 
-	NF_CT_STAT_INC(expect_delete);
+	NF_CT_STAT_INC(net, expect_delete);
 }
 EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
 
@@ -326,7 +326,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	add_timer(&exp->timeout);
 
 	atomic_inc(&exp->use);
-	NF_CT_STAT_INC(expect_create);
+	NF_CT_STAT_INC(net, expect_create);
 }
 
 /* Race with expectations being used means we could have none to find; OK. */
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 02eaf872277..a4fdbbf363a 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -203,7 +203,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu + 1;
-		return &per_cpu(nf_conntrack_stat, cpu);
+		return per_cpu_ptr(init_net.ct.stat, cpu);
 	}
 
 	return NULL;
@@ -217,7 +217,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu + 1;
-		return &per_cpu(nf_conntrack_stat, cpu);
+		return per_cpu_ptr(init_net.ct.stat, cpu);
 	}
 
 	return NULL;
-- 
cgit v1.2.3


From 8e9df80180b73d4107bf8fbf28b1633c541d2770 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:08 +0200
Subject: netfilter: netns nf_conntrack: per-netns /proc/net/stat/nf_conntrack,
 /proc/net/stat/ip_conntrack

Show correct conntrack count, while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14 +++++++++-----
 net/netfilter/nf_conntrack_standalone.c               | 14 +++++++++-----
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index fdc85b37078..313ebf00ee3 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -285,6 +285,7 @@ static const struct file_operations ip_exp_file_ops = {
 
 static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 {
+	struct net *net = seq_file_net(seq);
 	int cpu;
 
 	if (*pos == 0)
@@ -294,7 +295,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
-		return per_cpu_ptr(init_net.ct.stat, cpu);
+		return per_cpu_ptr(net->ct.stat, cpu);
 	}
 
 	return NULL;
@@ -302,13 +303,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct net *net = seq_file_net(seq);
 	int cpu;
 
 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
-		return per_cpu_ptr(init_net.ct.stat, cpu);
+		return per_cpu_ptr(net->ct.stat, cpu);
 	}
 
 	return NULL;
@@ -320,7 +322,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
 
 static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 {
-	unsigned int nr_conntracks = atomic_read(&init_net.ct.count);
+	struct net *net = seq_file_net(seq);
+	unsigned int nr_conntracks = atomic_read(&net->ct.count);
 	const struct ip_conntrack_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
@@ -360,7 +363,8 @@ static const struct seq_operations ct_cpu_seq_ops = {
 
 static int ct_cpu_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &ct_cpu_seq_ops);
+	return seq_open_net(inode, file, &ct_cpu_seq_ops,
+			    sizeof(struct seq_net_private));
 }
 
 static const struct file_operations ct_cpu_seq_fops = {
@@ -368,7 +372,7 @@ static const struct file_operations ct_cpu_seq_fops = {
 	.open    = ct_cpu_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release,
+	.release = seq_release_net,
 };
 
 static int __net_init ip_conntrack_net_init(struct net *net)
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index a4fdbbf363a..169760ddc16 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -194,6 +194,7 @@ static const struct file_operations ct_file_ops = {
 
 static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 {
+	struct net *net = seq_file_net(seq);
 	int cpu;
 
 	if (*pos == 0)
@@ -203,7 +204,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu + 1;
-		return per_cpu_ptr(init_net.ct.stat, cpu);
+		return per_cpu_ptr(net->ct.stat, cpu);
 	}
 
 	return NULL;
@@ -211,13 +212,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct net *net = seq_file_net(seq);
 	int cpu;
 
 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu + 1;
-		return per_cpu_ptr(init_net.ct.stat, cpu);
+		return per_cpu_ptr(net->ct.stat, cpu);
 	}
 
 	return NULL;
@@ -229,7 +231,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
 
 static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 {
-	unsigned int nr_conntracks = atomic_read(&init_net.ct.count);
+	struct net *net = seq_file_net(seq);
+	unsigned int nr_conntracks = atomic_read(&net->ct.count);
 	const struct ip_conntrack_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
@@ -269,7 +272,8 @@ static const struct seq_operations ct_cpu_seq_ops = {
 
 static int ct_cpu_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &ct_cpu_seq_ops);
+	return seq_open_net(inode, file, &ct_cpu_seq_ops,
+			    sizeof(struct seq_net_private));
 }
 
 static const struct file_operations ct_cpu_seq_fops = {
@@ -277,7 +281,7 @@ static const struct file_operations ct_cpu_seq_fops = {
 	.open	 = ct_cpu_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release,
+	.release = seq_release_net,
 };
 
 static int nf_conntrack_standalone_init_proc(struct net *net)
-- 
cgit v1.2.3


From 802507071b72ed5025747126099cbc6d1542f596 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:08 +0200
Subject: netfilter: netns nf_conntrack: per-netns
 net.netfilter.nf_conntrack_count sysctl

Note, sysctl table is always duplicated, this is simpler and less
special-cased.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_standalone.c | 73 ++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 169760ddc16..64b4f95b367 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -330,7 +330,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
-static struct ctl_table_header *nf_ct_sysctl_header;
 static struct ctl_table_header *nf_ct_netfilter_header;
 
 static ctl_table nf_ct_sysctl_table[] = {
@@ -409,40 +408,58 @@ static struct ctl_path nf_ct_path[] = {
 
 EXPORT_SYMBOL_GPL(nf_ct_log_invalid);
 
-static int nf_conntrack_standalone_init_sysctl(void)
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
-	nf_ct_netfilter_header =
-		register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table);
-	if (!nf_ct_netfilter_header)
-		goto out;
-
-	nf_ct_sysctl_header =
-		 register_sysctl_paths(nf_net_netfilter_sysctl_path,
-					nf_ct_sysctl_table);
-	if (!nf_ct_sysctl_header)
+	struct ctl_table *table;
+
+	if (net_eq(net, &init_net)) {
+		nf_ct_netfilter_header =
+		       register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table);
+		if (!nf_ct_netfilter_header)
+			goto out;
+	}
+
+	table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out_kmemdup;
+
+	table[1].data = &net->ct.count;
+
+	net->ct.sysctl_header = register_net_sysctl_table(net,
+					nf_net_netfilter_sysctl_path, table);
+	if (!net->ct.sysctl_header)
 		goto out_unregister_netfilter;
 
 	return 0;
 
 out_unregister_netfilter:
-	unregister_sysctl_table(nf_ct_netfilter_header);
+	kfree(table);
+out_kmemdup:
+	if (net_eq(net, &init_net))
+		unregister_sysctl_table(nf_ct_netfilter_header);
 out:
 	printk("nf_conntrack: can't register to sysctl.\n");
 	return -ENOMEM;
 }
 
-static void nf_conntrack_standalone_fini_sysctl(void)
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 {
-	unregister_sysctl_table(nf_ct_netfilter_header);
-	unregister_sysctl_table(nf_ct_sysctl_header);
+	struct ctl_table *table;
+
+	if (net_eq(net, &init_net))
+		unregister_sysctl_table(nf_ct_netfilter_header);
+	table = net->ct.sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.sysctl_header);
+	kfree(table);
 }
 #else
-static int nf_conntrack_standalone_init_sysctl(void)
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	return 0;
 }
 
-static void nf_conntrack_standalone_fini_sysctl(void)
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 {
 }
 #endif /* CONFIG_SYSCTL */
@@ -457,8 +474,13 @@ static int nf_conntrack_net_init(struct net *net)
 	ret = nf_conntrack_standalone_init_proc(net);
 	if (ret < 0)
 		goto out_proc;
+	ret = nf_conntrack_standalone_init_sysctl(net);
+	if (ret < 0)
+		goto out_sysctl;
 	return 0;
 
+out_sysctl:
+	nf_conntrack_standalone_fini_proc(net);
 out_proc:
 	nf_conntrack_cleanup(net);
 out_init:
@@ -467,6 +489,7 @@ out_init:
 
 static void nf_conntrack_net_exit(struct net *net)
 {
+	nf_conntrack_standalone_fini_sysctl(net);
 	nf_conntrack_standalone_fini_proc(net);
 	nf_conntrack_cleanup(net);
 }
@@ -478,25 +501,11 @@ static struct pernet_operations nf_conntrack_net_ops = {
 
 static int __init nf_conntrack_standalone_init(void)
 {
-	int ret;
-
-	ret = register_pernet_subsys(&nf_conntrack_net_ops);
-	if (ret < 0)
-		goto out;
-	ret = nf_conntrack_standalone_init_sysctl();
-	if (ret < 0)
-		goto out_sysctl;
-	return 0;
-
-out_sysctl:
-	unregister_pernet_subsys(&nf_conntrack_net_ops);
-out:
-	return ret;
+	return register_pernet_subsys(&nf_conntrack_net_ops);
 }
 
 static void __exit nf_conntrack_standalone_fini(void)
 {
-	nf_conntrack_standalone_fini_sysctl();
 	unregister_pernet_subsys(&nf_conntrack_net_ops);
 }
 
-- 
cgit v1.2.3


From c04d05529a6e0bf97183a2caf76a0c7f07f5b78c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:08 +0200
Subject: netfilter: netns nf_conntrack: per-netns
 net.netfilter.nf_conntrack_checksum sysctl

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   | 2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +-
 net/netfilter/nf_conntrack_proto_dccp.c        | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         | 2 +-
 net/netfilter/nf_conntrack_proto_udp.c         | 2 +-
 net/netfilter/nf_conntrack_proto_udplite.c     | 2 +-
 net/netfilter/nf_conntrack_standalone.c        | 7 +++----
 8 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 2e4dd3fb002..75871b1dd8a 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -270,7 +270,7 @@ static ctl_table ip_ct_sysctl_table[] = {
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_CHECKSUM,
 		.procname	= "ip_conntrack_checksum",
-		.data		= &nf_conntrack_checksum,
+		.data		= &init_net.ct.sysctl_checksum,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 205ba399d4a..ace66cbf921 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -188,7 +188,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index df04de91e6e..fa12e57749a 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -187,7 +187,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 		return -NF_ACCEPT;
 	}
 
-	if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
 		nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
 			      "nf_ct_icmpv6: ICMPv6 checksum failed\n");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 6ead8da3e9e..769680e68b5 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -575,7 +575,7 @@ static int dccp_error(struct net *net, struct sk_buff *skb,
 		}
 	}
 
-	if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
 				pf)) {
 		msg = "nf_ct_dccp: bad checksum ";
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b5d62d66e02..131c9be4470 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -780,7 +780,7 @@ static int tcp_error(struct net *net,
 	 * because the checksum is assumed to be correct.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index e0ee89e179c..3d3fffe3f8b 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -123,7 +123,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	 * We skip checking packets on the outgoing path
 	 * because the checksum is assumed to be correct.
 	 * FIXME: Source route IP option packets --RR */
-	if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
 		if (LOG_INVALID(IPPROTO_UDP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index c5b77c8f86c..3d1697c4f91 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -129,7 +129,7 @@ static int udplite_error(struct net *net,
 	}
 
 	/* Checksum invalid? Ignore. */
-	if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
 	    			pf)) {
 		if (LOG_INVALID(IPPROTO_UDPLITE))
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 64b4f95b367..5cd06637977 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -322,9 +322,6 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
 
 /* Sysctl support */
 
-int nf_conntrack_checksum __read_mostly = 1;
-EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
-
 #ifdef CONFIG_SYSCTL
 /* Log invalid packets of a given protocol */
 static int log_invalid_proto_min = 0;
@@ -360,7 +357,7 @@ static ctl_table nf_ct_sysctl_table[] = {
 	{
 		.ctl_name	= NET_NF_CONNTRACK_CHECKSUM,
 		.procname	= "nf_conntrack_checksum",
-		.data		= &nf_conntrack_checksum,
+		.data		= &init_net.ct.sysctl_checksum,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
@@ -425,6 +422,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 		goto out_kmemdup;
 
 	table[1].data = &net->ct.count;
+	table[3].data = &net->ct.sysctl_checksum;
 
 	net->ct.sysctl_header = register_net_sysctl_table(net,
 					nf_net_netfilter_sysctl_path, table);
@@ -474,6 +472,7 @@ static int nf_conntrack_net_init(struct net *net)
 	ret = nf_conntrack_standalone_init_proc(net);
 	if (ret < 0)
 		goto out_proc;
+	net->ct.sysctl_checksum = 1;
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
 		goto out_sysctl;
-- 
cgit v1.2.3


From c2a2c7e0cc39e7f9336cd67e8307a110bdba82f3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:08 +0200
Subject: netfilter: netns nf_conntrack: per-netns
 net.netfilter.nf_conntrack_log_invalid sysctl

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  6 +++---
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_core.c              |  1 -
 net/netfilter/nf_conntrack_proto_dccp.c        | 10 ++++++----
 net/netfilter/nf_conntrack_proto_tcp.c         | 18 ++++++++++--------
 net/netfilter/nf_conntrack_proto_udp.c         |  6 +++---
 net/netfilter/nf_conntrack_proto_udplite.c     |  8 ++++----
 net/netfilter/nf_conntrack_standalone.c        |  6 +++---
 9 files changed, 31 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 75871b1dd8a..af69acc1d0f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -278,7 +278,7 @@ static ctl_table ip_ct_sysctl_table[] = {
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_LOG_INVALID,
 		.procname	= "ip_conntrack_log_invalid",
-		.data		= &nf_ct_log_invalid,
+		.data		= &init_net.ct.sysctl_log_invalid,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index ace66cbf921..4e887922022 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -181,7 +181,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	/* Not enough header? */
 	icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
 	if (icmph == NULL) {
-		if (LOG_INVALID(IPPROTO_ICMP))
+		if (LOG_INVALID(net, IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_icmp: short packet ");
 		return -NF_ACCEPT;
@@ -190,7 +190,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	/* See ip_conntrack_proto_tcp.c */
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
-		if (LOG_INVALID(IPPROTO_ICMP))
+		if (LOG_INVALID(net, IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_icmp: bad HW ICMP checksum ");
 		return -NF_ACCEPT;
@@ -203,7 +203,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	 *		  discarded.
 	 */
 	if (icmph->type > NR_ICMP_TYPES) {
-		if (LOG_INVALID(IPPROTO_ICMP))
+		if (LOG_INVALID(net, IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_icmp: invalid ICMP type ");
 		return -NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index fa12e57749a..05726177903 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -181,7 +181,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 
 	icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
 	if (icmp6h == NULL) {
-		if (LOG_INVALID(IPPROTO_ICMPV6))
+		if (LOG_INVALID(net, IPPROTO_ICMPV6))
 		nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
 			      "nf_ct_icmpv6: short packet ");
 		return -NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1e87fa0cd3a..ade0bb3ab2e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -53,7 +53,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
 struct nf_conn nf_conntrack_untracked __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 
-unsigned int nf_ct_log_invalid __read_mostly;
 static struct kmem_cache *nf_conntrack_cachep __read_mostly;
 
 static int nf_conntrack_hash_rnd_initted;
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 769680e68b5..8fcf1762fab 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -418,6 +418,7 @@ static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
 static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		     unsigned int dataoff)
 {
+	struct net *net = nf_ct_net(ct);
 	struct dccp_hdr _dh, *dh;
 	const char *msg;
 	u_int8_t state;
@@ -445,7 +446,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 
 out_invalid:
-	if (LOG_INVALID(IPPROTO_DCCP))
+	if (LOG_INVALID(net, IPPROTO_DCCP))
 		nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
 	return false;
 }
@@ -463,6 +464,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
 		       u_int8_t pf, unsigned int hooknum)
 {
+	struct net *net = nf_ct_net(ct);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	struct dccp_hdr _dh, *dh;
 	u_int8_t type, old_state, new_state;
@@ -524,13 +526,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 		ct->proto.dccp.last_pkt = type;
 
 		write_unlock_bh(&dccp_lock);
-		if (LOG_INVALID(IPPROTO_DCCP))
+		if (LOG_INVALID(net, IPPROTO_DCCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case CT_DCCP_INVALID:
 		write_unlock_bh(&dccp_lock);
-		if (LOG_INVALID(IPPROTO_DCCP))
+		if (LOG_INVALID(net, IPPROTO_DCCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid state transition ");
 		return -NF_ACCEPT;
@@ -590,7 +592,7 @@ static int dccp_error(struct net *net, struct sk_buff *skb,
 	return NF_ACCEPT;
 
 out_invalid:
-	if (LOG_INVALID(IPPROTO_DCCP))
+	if (LOG_INVALID(net, IPPROTO_DCCP))
 		nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
 	return -NF_ACCEPT;
 }
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 131c9be4470..f947ec41e39 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -488,6 +488,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			  const struct tcphdr *tcph,
 			  u_int8_t pf)
 {
+	struct net *net = nf_ct_net(ct);
 	struct ip_ct_tcp_state *sender = &state->seen[dir];
 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
 	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
@@ -668,7 +669,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
 		    nf_ct_tcp_be_liberal)
 			res = true;
-		if (!res && LOG_INVALID(IPPROTO_TCP))
+		if (!res && LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 			"nf_ct_tcp: %s ",
 			before(seq, sender->td_maxend + 1) ?
@@ -761,7 +762,7 @@ static int tcp_error(struct net *net,
 	/* Smaller that minimal TCP header? */
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
-		if (LOG_INVALID(IPPROTO_TCP))
+		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_tcp: short packet ");
 		return -NF_ACCEPT;
@@ -769,7 +770,7 @@ static int tcp_error(struct net *net,
 
 	/* Not whole TCP header or malformed packet */
 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
-		if (LOG_INVALID(IPPROTO_TCP))
+		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_tcp: truncated/malformed packet ");
 		return -NF_ACCEPT;
@@ -782,7 +783,7 @@ static int tcp_error(struct net *net,
 	/* FIXME: Source route IP option packets --RR */
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
-		if (LOG_INVALID(IPPROTO_TCP))
+		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: bad TCP checksum ");
 		return -NF_ACCEPT;
@@ -791,7 +792,7 @@ static int tcp_error(struct net *net,
 	/* Check TCP flags. */
 	tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH));
 	if (!tcp_valid_flags[tcpflags]) {
-		if (LOG_INVALID(IPPROTO_TCP))
+		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid TCP flag combination ");
 		return -NF_ACCEPT;
@@ -808,6 +809,7 @@ static int tcp_packet(struct nf_conn *ct,
 		      u_int8_t pf,
 		      unsigned int hooknum)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple *tuple;
 	enum tcp_conntrack new_state, old_state;
 	enum ip_conntrack_dir dir;
@@ -886,7 +888,7 @@ static int tcp_packet(struct nf_conn *ct,
 			 * thus initiate a clean new session.
 			 */
 			write_unlock_bh(&tcp_lock);
-			if (LOG_INVALID(IPPROTO_TCP))
+			if (LOG_INVALID(net, IPPROTO_TCP))
 				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 					  "nf_ct_tcp: killing out of sync session ");
 			nf_ct_kill(ct);
@@ -899,7 +901,7 @@ static int tcp_packet(struct nf_conn *ct,
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
 
 		write_unlock_bh(&tcp_lock);
-		if (LOG_INVALID(IPPROTO_TCP))
+		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid packet ignored ");
 		return NF_ACCEPT;
@@ -908,7 +910,7 @@ static int tcp_packet(struct nf_conn *ct,
 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 			 dir, get_conntrack_index(th), old_state);
 		write_unlock_bh(&tcp_lock);
-		if (LOG_INVALID(IPPROTO_TCP))
+		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3d3fffe3f8b..7c2ca48698b 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -101,7 +101,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	/* Header is too small? */
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
-		if (LOG_INVALID(IPPROTO_UDP))
+		if (LOG_INVALID(net, IPPROTO_UDP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udp: short packet ");
 		return -NF_ACCEPT;
@@ -109,7 +109,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 
 	/* Truncated/malformed packets */
 	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
-		if (LOG_INVALID(IPPROTO_UDP))
+		if (LOG_INVALID(net, IPPROTO_UDP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_udp: truncated/malformed packet ");
 		return -NF_ACCEPT;
@@ -125,7 +125,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	 * FIXME: Source route IP option packets --RR */
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
-		if (LOG_INVALID(IPPROTO_UDP))
+		if (LOG_INVALID(net, IPPROTO_UDP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_udp: bad UDP checksum ");
 		return -NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 3d1697c4f91..d22d839e4f9 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -104,7 +104,7 @@ static int udplite_error(struct net *net,
 	/* Header is too small? */
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
-		if (LOG_INVALID(IPPROTO_UDPLITE))
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udplite: short packet ");
 		return -NF_ACCEPT;
@@ -114,7 +114,7 @@ static int udplite_error(struct net *net,
 	if (cscov == 0)
 		cscov = udplen;
 	else if (cscov < sizeof(*hdr) || cscov > udplen) {
-		if (LOG_INVALID(IPPROTO_UDPLITE))
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_udplite: invalid checksum coverage ");
 		return -NF_ACCEPT;
@@ -122,7 +122,7 @@ static int udplite_error(struct net *net,
 
 	/* UDPLITE mandates checksums */
 	if (!hdr->check) {
-		if (LOG_INVALID(IPPROTO_UDPLITE))
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udplite: checksum missing ");
 		return -NF_ACCEPT;
@@ -132,7 +132,7 @@ static int udplite_error(struct net *net,
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
 	    			pf)) {
-		if (LOG_INVALID(IPPROTO_UDPLITE))
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udplite: bad UDPLite checksum ");
 		return -NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5cd06637977..98106d4e89f 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -365,7 +365,7 @@ static ctl_table nf_ct_sysctl_table[] = {
 	{
 		.ctl_name	= NET_NF_CONNTRACK_LOG_INVALID,
 		.procname	= "nf_conntrack_log_invalid",
-		.data		= &nf_ct_log_invalid,
+		.data		= &init_net.ct.sysctl_log_invalid,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
@@ -403,8 +403,6 @@ static struct ctl_path nf_ct_path[] = {
 	{ }
 };
 
-EXPORT_SYMBOL_GPL(nf_ct_log_invalid);
-
 static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	struct ctl_table *table;
@@ -423,6 +421,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[1].data = &net->ct.count;
 	table[3].data = &net->ct.sysctl_checksum;
+	table[4].data = &net->ct.sysctl_log_invalid;
 
 	net->ct.sysctl_header = register_net_sysctl_table(net,
 					nf_net_netfilter_sysctl_path, table);
@@ -473,6 +472,7 @@ static int nf_conntrack_net_init(struct net *net)
 	if (ret < 0)
 		goto out_proc;
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_log_invalid = 0;
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
 		goto out_sysctl;
-- 
cgit v1.2.3


From d716a4dfbbdf0d4731d596a96e5f4b0d892ac168 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:09 +0200
Subject: netfilter: netns nf_conntrack: per-netns conntrack accounting

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_acct.c | 100 +++++++++++++++++++++++++++-----------
 net/netfilter/nf_conntrack_core.c |   4 +-
 2 files changed, 74 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 59bd8b903a1..03591d37b9c 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -22,19 +22,17 @@
 #define NF_CT_ACCT_DEFAULT 0
 #endif
 
-int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT;
-EXPORT_SYMBOL_GPL(nf_ct_acct);
+static int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT;
 
 module_param_named(acct, nf_ct_acct, bool, 0644);
 MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_header *acct_sysctl_header;
 static struct ctl_table acct_sysctl_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nf_conntrack_acct",
-		.data		= &nf_ct_acct,
+		.data		= &init_net.ct.sysctl_acct,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
@@ -64,41 +62,87 @@ static struct nf_ct_ext_type acct_extend __read_mostly = {
 	.id	= NF_CT_EXT_ACCT,
 };
 
-int nf_conntrack_acct_init(void)
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_acct_init_sysctl(struct net *net)
 {
-	int ret;
+	struct ctl_table *table;
 
-#ifdef CONFIG_NF_CT_ACCT
-	printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n");
-	printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n");
-	printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n");
-#endif
+	table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	table[0].data = &net->ct.sysctl_acct;
 
-	ret = nf_ct_extend_register(&acct_extend);
-	if (ret < 0) {
-		printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
-		return ret;
+	net->ct.acct_sysctl_header = register_net_sysctl_table(net,
+			nf_net_netfilter_sysctl_path, table);
+	if (!net->ct.acct_sysctl_header) {
+		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
+		goto out_register;
 	}
+	return 0;
 
-#ifdef CONFIG_SYSCTL
-	acct_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
-				acct_sysctl_table);
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;
+}
 
-	if (!acct_sysctl_header) {
-		nf_ct_extend_unregister(&acct_extend);
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
 
-		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
-		return -ENOMEM;
-	}
+	table = net->ct.acct_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.acct_sysctl_header);
+	kfree(table);
+}
+#else
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_acct_init(struct net *net)
+{
+	int ret;
+
+	net->ct.sysctl_acct = nf_ct_acct;
+
+	if (net_eq(net, &init_net)) {
+#ifdef CONFIG_NF_CT_ACCT
+		printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n");
+		printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n");
+		printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n");
 #endif
 
+		ret = nf_ct_extend_register(&acct_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
+			goto out_extend_register;
+		}
+	}
+
+	ret = nf_conntrack_acct_init_sysctl(net);
+	if (ret < 0)
+		goto out_sysctl;
+
 	return 0;
+
+out_sysctl:
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&acct_extend);
+out_extend_register:
+	return ret;
 }
 
-void nf_conntrack_acct_fini(void)
+void nf_conntrack_acct_fini(struct net *net)
 {
-#ifdef CONFIG_SYSCTL
-	unregister_sysctl_table(acct_sysctl_header);
-#endif
-	nf_ct_extend_unregister(&acct_extend);
+	nf_conntrack_acct_fini_sysctl(net);
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&acct_extend);
 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ade0bb3ab2e..bb26460d897 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1039,7 +1039,7 @@ void nf_conntrack_cleanup(struct net *net)
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
 
-	nf_conntrack_acct_fini();
+	nf_conntrack_acct_fini(net);
 	nf_conntrack_expect_fini(net);
 	free_percpu(net->ct.stat);
 	nf_conntrack_helper_fini();
@@ -1191,7 +1191,7 @@ int nf_conntrack_init(struct net *net)
 	if (ret < 0)
 		goto out_fini_expect;
 
-	ret = nf_conntrack_acct_init();
+	ret = nf_conntrack_acct_init(net);
 	if (ret < 0)
 		goto out_fini_helper;
 
-- 
cgit v1.2.3


From 08f6547d266fdba087f7fa7963fc0610be5b7cd7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:09 +0200
Subject: netfilter: netns nf_conntrack: final netns tweaks

Add init_net checks to not remove kmem_caches twice and so on.

Refactor functions to split code which should be executed only for
init_net into one place.

ip_ct_attach and ip_ct_destroy assignments remain separate, because
they're separate stages in setup and teardown.

NOTE: NOTRACK code is in for-every-net part. It will be made per-netns
after we decidce how to do it correctly.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c   | 151 +++++++++++++++++++++++-------------
 net/netfilter/nf_conntrack_expect.c |  26 ++++---
 2 files changed, 114 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index bb26460d897..27de3c7b006 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1010,17 +1010,15 @@ void nf_conntrack_flush(struct net *net)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush);
 
-/* Mishearing the voices in his head, our hero wonders how he's
-   supposed to kill the mall. */
-void nf_conntrack_cleanup(struct net *net)
+static void nf_conntrack_cleanup_init_net(void)
 {
-	rcu_assign_pointer(ip_ct_attach, NULL);
-
-	/* This makes sure all current packets have passed through
-	   netfilter framework.  Roll on, two-stage module
-	   delete... */
-	synchronize_net();
+	nf_conntrack_helper_fini();
+	nf_conntrack_proto_fini();
+	kmem_cache_destroy(nf_conntrack_cachep);
+}
 
+static void nf_conntrack_cleanup_net(struct net *net)
+{
 	nf_ct_event_cache_flush(net);
 	nf_conntrack_ecache_fini(net);
  i_see_dead_people:
@@ -1033,17 +1031,31 @@ void nf_conntrack_cleanup(struct net *net)
 	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
 		schedule();
 
-	rcu_assign_pointer(nf_ct_destroy, NULL);
-
-	kmem_cache_destroy(nf_conntrack_cachep);
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
-
 	nf_conntrack_acct_fini(net);
 	nf_conntrack_expect_fini(net);
 	free_percpu(net->ct.stat);
-	nf_conntrack_helper_fini();
-	nf_conntrack_proto_fini();
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void nf_conntrack_cleanup(struct net *net)
+{
+	if (net_eq(net, &init_net))
+		rcu_assign_pointer(ip_ct_attach, NULL);
+
+	/* This makes sure all current packets have passed through
+	   netfilter framework.  Roll on, two-stage module
+	   delete... */
+	synchronize_net();
+
+	nf_conntrack_cleanup_net(net);
+
+	if (net_eq(net, &init_net)) {
+		rcu_assign_pointer(nf_ct_destroy, NULL);
+		nf_conntrack_cleanup_init_net();
+	}
 }
 
 struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced)
@@ -1128,7 +1140,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
-int nf_conntrack_init(struct net *net)
+static int nf_conntrack_init_init_net(void)
 {
 	int max_factor = 8;
 	int ret;
@@ -1150,21 +1162,6 @@ int nf_conntrack_init(struct net *net)
 		 * entries. */
 		max_factor = 4;
 	}
-	atomic_set(&net->ct.count, 0);
-	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
-	if (!net->ct.stat)
-		goto err_stat;
-	ret = nf_conntrack_ecache_init(net);
-	if (ret < 0)
-		goto err_ecache;
-	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
-						  &net->ct.hash_vmalloc);
-	if (!net->ct.hash) {
-		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
-		goto err_hash;
-	}
-	INIT_HLIST_HEAD(&net->ct.unconfirmed);
-
 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	printk("nf_conntrack version %s (%u buckets, %d max)\n",
@@ -1176,28 +1173,55 @@ int nf_conntrack_init(struct net *net)
 						0, 0, NULL);
 	if (!nf_conntrack_cachep) {
 		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
-		goto err_free_hash;
+		ret = -ENOMEM;
+		goto err_cache;
 	}
 
 	ret = nf_conntrack_proto_init();
 	if (ret < 0)
-		goto err_free_conntrack_slab;
-
-	ret = nf_conntrack_expect_init(net);
-	if (ret < 0)
-		goto out_fini_proto;
+		goto err_proto;
 
 	ret = nf_conntrack_helper_init();
 	if (ret < 0)
-		goto out_fini_expect;
+		goto err_helper;
+
+	return 0;
+
+err_helper:
+	nf_conntrack_proto_fini();
+err_proto:
+	kmem_cache_destroy(nf_conntrack_cachep);
+err_cache:
+	return ret;
+}
+
+static int nf_conntrack_init_net(struct net *net)
+{
+	int ret;
 
+	atomic_set(&net->ct.count, 0);
+	INIT_HLIST_HEAD(&net->ct.unconfirmed);
+	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+	if (!net->ct.stat) {
+		ret = -ENOMEM;
+		goto err_stat;
+	}
+	ret = nf_conntrack_ecache_init(net);
+	if (ret < 0)
+		goto err_ecache;
+	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
+						  &net->ct.hash_vmalloc);
+	if (!net->ct.hash) {
+		ret = -ENOMEM;
+		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+		goto err_hash;
+	}
+	ret = nf_conntrack_expect_init(net);
+	if (ret < 0)
+		goto err_expect;
 	ret = nf_conntrack_acct_init(net);
 	if (ret < 0)
-		goto out_fini_helper;
-
-	/* For use by REJECT target */
-	rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
-	rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
+		goto err_acct;
 
 	/* Set up fake conntrack:
 	    - to never be deleted, not in any hashes */
@@ -1208,17 +1232,11 @@ int nf_conntrack_init(struct net *net)
 	/*  - and look it like as a confirmed connection */
 	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
 
-	return ret;
+	return 0;
 
-out_fini_helper:
-	nf_conntrack_helper_fini();
-out_fini_expect:
+err_acct:
 	nf_conntrack_expect_fini(net);
-out_fini_proto:
-	nf_conntrack_proto_fini();
-err_free_conntrack_slab:
-	kmem_cache_destroy(nf_conntrack_cachep);
-err_free_hash:
+err_expect:
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
 err_hash:
@@ -1226,5 +1244,32 @@ err_hash:
 err_ecache:
 	free_percpu(net->ct.stat);
 err_stat:
-	return -ENOMEM;
+	return ret;
+}
+
+int nf_conntrack_init(struct net *net)
+{
+	int ret;
+
+	if (net_eq(net, &init_net)) {
+		ret = nf_conntrack_init_init_net();
+		if (ret < 0)
+			goto out_init_net;
+	}
+	ret = nf_conntrack_init_net(net);
+	if (ret < 0)
+		goto out_net;
+
+	if (net_eq(net, &init_net)) {
+		/* For use by REJECT target */
+		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
+		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
+	}
+	return 0;
+
+out_net:
+	if (net_eq(net, &init_net))
+		nf_conntrack_cleanup_init_net();
+out_init_net:
+	return ret;
 }
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index b7f75117161..37a703bc3b8 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -563,12 +563,14 @@ int nf_conntrack_expect_init(struct net *net)
 {
 	int err = -ENOMEM;
 
-	if (!nf_ct_expect_hsize) {
-		nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
-		if (!nf_ct_expect_hsize)
-			nf_ct_expect_hsize = 1;
+	if (net_eq(net, &init_net)) {
+		if (!nf_ct_expect_hsize) {
+			nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
+			if (!nf_ct_expect_hsize)
+				nf_ct_expect_hsize = 1;
+		}
+		nf_ct_expect_max = nf_ct_expect_hsize * 4;
 	}
-	nf_ct_expect_max = nf_ct_expect_hsize * 4;
 
 	net->ct.expect_count = 0;
 	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
@@ -576,11 +578,13 @@ int nf_conntrack_expect_init(struct net *net)
 	if (net->ct.expect_hash == NULL)
 		goto err1;
 
-	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+	if (net_eq(net, &init_net)) {
+		nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
 					sizeof(struct nf_conntrack_expect),
 					0, 0, NULL);
-	if (!nf_ct_expect_cachep)
-		goto err2;
+		if (!nf_ct_expect_cachep)
+			goto err2;
+	}
 
 	err = exp_proc_init(net);
 	if (err < 0)
@@ -589,7 +593,8 @@ int nf_conntrack_expect_init(struct net *net)
 	return 0;
 
 err3:
-	kmem_cache_destroy(nf_ct_expect_cachep);
+	if (net_eq(net, &init_net))
+		kmem_cache_destroy(nf_ct_expect_cachep);
 err2:
 	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
 			     nf_ct_expect_hsize);
@@ -600,7 +605,8 @@ err1:
 void nf_conntrack_expect_fini(struct net *net)
 {
 	exp_proc_remove(net);
-	kmem_cache_destroy(nf_ct_expect_cachep);
+	if (net_eq(net, &init_net))
+		kmem_cache_destroy(nf_ct_expect_cachep);
 	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
 			     nf_ct_expect_hsize);
 }
-- 
cgit v1.2.3


From a5c3a8005cb7a36ebcc5b849f6045069ce4f7ca8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:09 +0200
Subject: netfilter: netns nf_conntrack: SIP conntracking in netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_sip.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index a006080eb38..6813f1c8863 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -736,6 +736,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb,
 	struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct net *net = nf_ct_net(ct);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	union nf_inet_addr *saddr;
 	struct nf_conntrack_tuple tuple;
@@ -775,7 +776,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb,
 
 	rcu_read_lock();
 	do {
-		exp = __nf_ct_expect_find(&init_net, &tuple);
+		exp = __nf_ct_expect_find(net, &tuple);
 
 		if (!exp || exp->master == ct ||
 		    nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
-- 
cgit v1.2.3


From 84541cc13a3bb31a58c096dde3517461e3ad91c2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:09 +0200
Subject: netfilter: netns nf_conntrack: H323 conntracking in netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_h323_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index dfb826c973d..c1504f71cdf 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1210,6 +1210,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
 					       union nf_inet_addr *addr,
 					       __be16 port)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_tuple tuple;
 
@@ -1219,7 +1220,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
 	tuple.dst.u.tcp.port = port;
 	tuple.dst.protonum = IPPROTO_TCP;
 
-	exp = __nf_ct_expect_find(&init_net, &tuple);
+	exp = __nf_ct_expect_find(net, &tuple);
 	if (exp && exp->master == ct)
 		return exp;
 	return NULL;
-- 
cgit v1.2.3


From 3bb0d1c00f86b13bb184193a8f0189ddd6f0459f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:10 +0200
Subject: netfilter: netns nf_conntrack: GRE conntracking in netns

* make keymap list per-netns
* per-netns keymal lock (not strictly necessary)
* flush keymap at netns stop and module unload.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_pptp.c      |  2 +-
 net/netfilter/nf_conntrack_proto_gre.c | 97 ++++++++++++++++++++++++++--------
 2 files changed, 75 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 5db7df5d19b..e47d5de41cc 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -602,7 +602,7 @@ static int __init nf_conntrack_pptp_init(void)
 static void __exit nf_conntrack_pptp_fini(void)
 {
 	nf_conntrack_helper_unregister(&pptp);
-	nf_ct_gre_keymap_flush();
+	nf_ct_gre_keymap_flush(&init_net);
 }
 
 module_init(nf_conntrack_pptp_init);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 5b1273a01fe..a2cdbcbf64c 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -29,8 +29,11 @@
 #include <linux/list.h>
 #include <linux/seq_file.h>
 #include <linux/in.h>
+#include <linux/netdevice.h>
 #include <linux/skbuff.h>
-
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -40,19 +43,23 @@
 #define GRE_TIMEOUT		(30 * HZ)
 #define GRE_STREAM_TIMEOUT	(180 * HZ)
 
-static DEFINE_RWLOCK(nf_ct_gre_lock);
-static LIST_HEAD(gre_keymap_list);
+static int proto_gre_net_id;
+struct netns_proto_gre {
+	rwlock_t		keymap_lock;
+	struct list_head	keymap_list;
+};
 
-void nf_ct_gre_keymap_flush(void)
+void nf_ct_gre_keymap_flush(struct net *net)
 {
+	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
 	struct nf_ct_gre_keymap *km, *tmp;
 
-	write_lock_bh(&nf_ct_gre_lock);
-	list_for_each_entry_safe(km, tmp, &gre_keymap_list, list) {
+	write_lock_bh(&net_gre->keymap_lock);
+	list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
 		list_del(&km->list);
 		kfree(km);
 	}
-	write_unlock_bh(&nf_ct_gre_lock);
+	write_unlock_bh(&net_gre->keymap_lock);
 }
 EXPORT_SYMBOL(nf_ct_gre_keymap_flush);
 
@@ -67,19 +74,20 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
 }
 
 /* look up the source key for a given tuple */
-static __be16 gre_keymap_lookup(struct nf_conntrack_tuple *t)
+static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
 {
+	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
 	struct nf_ct_gre_keymap *km;
 	__be16 key = 0;
 
-	read_lock_bh(&nf_ct_gre_lock);
-	list_for_each_entry(km, &gre_keymap_list, list) {
+	read_lock_bh(&net_gre->keymap_lock);
+	list_for_each_entry(km, &net_gre->keymap_list, list) {
 		if (gre_key_cmpfn(km, t)) {
 			key = km->tuple.src.u.gre.key;
 			break;
 		}
 	}
-	read_unlock_bh(&nf_ct_gre_lock);
+	read_unlock_bh(&net_gre->keymap_lock);
 
 	pr_debug("lookup src key 0x%x for ", key);
 	nf_ct_dump_tuple(t);
@@ -91,20 +99,22 @@ static __be16 gre_keymap_lookup(struct nf_conntrack_tuple *t)
 int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
 			 struct nf_conntrack_tuple *t)
 {
+	struct net *net = nf_ct_net(ct);
+	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
 	struct nf_conn_help *help = nfct_help(ct);
 	struct nf_ct_gre_keymap **kmp, *km;
 
 	kmp = &help->help.ct_pptp_info.keymap[dir];
 	if (*kmp) {
 		/* check whether it's a retransmission */
-		read_lock_bh(&nf_ct_gre_lock);
-		list_for_each_entry(km, &gre_keymap_list, list) {
+		read_lock_bh(&net_gre->keymap_lock);
+		list_for_each_entry(km, &net_gre->keymap_list, list) {
 			if (gre_key_cmpfn(km, t) && km == *kmp) {
-				read_unlock_bh(&nf_ct_gre_lock);
+				read_unlock_bh(&net_gre->keymap_lock);
 				return 0;
 			}
 		}
-		read_unlock_bh(&nf_ct_gre_lock);
+		read_unlock_bh(&net_gre->keymap_lock);
 		pr_debug("trying to override keymap_%s for ct %p\n",
 			 dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
 		return -EEXIST;
@@ -119,9 +129,9 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
 	pr_debug("adding new entry %p: ", km);
 	nf_ct_dump_tuple(&km->tuple);
 
-	write_lock_bh(&nf_ct_gre_lock);
-	list_add_tail(&km->list, &gre_keymap_list);
-	write_unlock_bh(&nf_ct_gre_lock);
+	write_lock_bh(&net_gre->keymap_lock);
+	list_add_tail(&km->list, &net_gre->keymap_list);
+	write_unlock_bh(&net_gre->keymap_lock);
 
 	return 0;
 }
@@ -130,12 +140,14 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
 /* destroy the keymap entries associated with specified master ct */
 void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
+	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
 	struct nf_conn_help *help = nfct_help(ct);
 	enum ip_conntrack_dir dir;
 
 	pr_debug("entering for ct %p\n", ct);
 
-	write_lock_bh(&nf_ct_gre_lock);
+	write_lock_bh(&net_gre->keymap_lock);
 	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
 		if (help->help.ct_pptp_info.keymap[dir]) {
 			pr_debug("removing %p from list\n",
@@ -145,7 +157,7 @@ void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
 			help->help.ct_pptp_info.keymap[dir] = NULL;
 		}
 	}
-	write_unlock_bh(&nf_ct_gre_lock);
+	write_unlock_bh(&net_gre->keymap_lock);
 }
 EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
 
@@ -164,6 +176,7 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
 static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 			     struct nf_conntrack_tuple *tuple)
 {
+	struct net *net = dev_net(skb->dev ? skb->dev : skb->dst->dev);
 	const struct gre_hdr_pptp *pgrehdr;
 	struct gre_hdr_pptp _pgrehdr;
 	__be16 srckey;
@@ -190,7 +203,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	tuple->dst.u.gre.key = pgrehdr->call_id;
-	srckey = gre_keymap_lookup(tuple);
+	srckey = gre_keymap_lookup(net, tuple);
 	tuple->src.u.gre.key = srckey;
 
 	return true;
@@ -285,15 +298,53 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
 #endif
 };
 
+static int proto_gre_net_init(struct net *net)
+{
+	struct netns_proto_gre *net_gre;
+	int rv;
+
+	net_gre = kmalloc(sizeof(struct netns_proto_gre), GFP_KERNEL);
+	if (!net_gre)
+		return -ENOMEM;
+	rwlock_init(&net_gre->keymap_lock);
+	INIT_LIST_HEAD(&net_gre->keymap_list);
+
+	rv = net_assign_generic(net, proto_gre_net_id, net_gre);
+	if (rv < 0)
+		kfree(net_gre);
+	return rv;
+}
+
+static void proto_gre_net_exit(struct net *net)
+{
+	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
+
+	nf_ct_gre_keymap_flush(net);
+	kfree(net_gre);
+}
+
+static struct pernet_operations proto_gre_net_ops = {
+	.init = proto_gre_net_init,
+	.exit = proto_gre_net_exit,
+};
+
 static int __init nf_ct_proto_gre_init(void)
 {
-	return nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4);
+	int rv;
+
+	rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4);
+	if (rv < 0)
+		return rv;
+	rv = register_pernet_gen_device(&proto_gre_net_id, &proto_gre_net_ops);
+	if (rv < 0)
+		nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+	return rv;
 }
 
 static void nf_ct_proto_gre_fini(void)
 {
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
-	nf_ct_gre_keymap_flush();
+	unregister_pernet_gen_device(proto_gre_net_id, &proto_gre_net_ops);
 }
 
 module_init(nf_ct_proto_gre_init);
-- 
cgit v1.2.3


From 0e6e75af921d1f4799eeb9f83a31c86ab7cdeb8f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:10 +0200
Subject: netfilter: netns nf_conntrack: PPTP conntracking in netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_pptp.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index e47d5de41cc..373e51e91ce 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -98,6 +98,7 @@ EXPORT_SYMBOL(pptp_msg_name);
 static void pptp_expectfn(struct nf_conn *ct,
 			 struct nf_conntrack_expect *exp)
 {
+	struct net *net = nf_ct_net(ct);
 	typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
 	pr_debug("increasing timeouts\n");
 
@@ -121,7 +122,7 @@ static void pptp_expectfn(struct nf_conn *ct,
 		pr_debug("trying to unexpect other dir: ");
 		nf_ct_dump_tuple(&inv_t);
 
-		exp_other = nf_ct_expect_find_get(&init_net, &inv_t);
+		exp_other = nf_ct_expect_find_get(net, &inv_t);
 		if (exp_other) {
 			/* delete other expectation.  */
 			pr_debug("found\n");
@@ -134,7 +135,8 @@ static void pptp_expectfn(struct nf_conn *ct,
 	rcu_read_unlock();
 }
 
-static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
+static int destroy_sibling_or_exp(struct net *net,
+				  const struct nf_conntrack_tuple *t)
 {
 	const struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_expect *exp;
@@ -143,7 +145,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 	pr_debug("trying to timeout ct or exp for tuple ");
 	nf_ct_dump_tuple(t);
 
-	h = nf_conntrack_find_get(&init_net, t);
+	h = nf_conntrack_find_get(net, t);
 	if (h)  {
 		sibling = nf_ct_tuplehash_to_ctrack(h);
 		pr_debug("setting timeout of conntrack %p to 0\n", sibling);
@@ -154,7 +156,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 		nf_ct_put(sibling);
 		return 1;
 	} else {
-		exp = nf_ct_expect_find_get(&init_net, t);
+		exp = nf_ct_expect_find_get(net, t);
 		if (exp) {
 			pr_debug("unexpect_related of expect %p\n", exp);
 			nf_ct_unexpect_related(exp);
@@ -168,6 +170,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 /* timeout GRE data connections */
 static void pptp_destroy_siblings(struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	const struct nf_conn_help *help = nfct_help(ct);
 	struct nf_conntrack_tuple t;
 
@@ -178,7 +181,7 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
 	t.dst.protonum = IPPROTO_GRE;
 	t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id;
 	t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id;
-	if (!destroy_sibling_or_exp(&t))
+	if (!destroy_sibling_or_exp(net, &t))
 		pr_debug("failed to timeout original pns->pac ct/exp\n");
 
 	/* try reply (pac->pns) tuple */
@@ -186,7 +189,7 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
 	t.dst.protonum = IPPROTO_GRE;
 	t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id;
 	t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id;
-	if (!destroy_sibling_or_exp(&t))
+	if (!destroy_sibling_or_exp(net, &t))
 		pr_debug("failed to timeout reply pac->pns ct/exp\n");
 }
 
@@ -594,15 +597,32 @@ static struct nf_conntrack_helper pptp __read_mostly = {
 	.expect_policy		= &pptp_exp_policy,
 };
 
+static void nf_conntrack_pptp_net_exit(struct net *net)
+{
+	nf_ct_gre_keymap_flush(net);
+}
+
+static struct pernet_operations nf_conntrack_pptp_net_ops = {
+	.exit = nf_conntrack_pptp_net_exit,
+};
+
 static int __init nf_conntrack_pptp_init(void)
 {
-	return nf_conntrack_helper_register(&pptp);
+	int rv;
+
+	rv = nf_conntrack_helper_register(&pptp);
+	if (rv < 0)
+		return rv;
+	rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops);
+	if (rv < 0)
+		nf_conntrack_helper_unregister(&pptp);
+	return rv;
 }
 
 static void __exit nf_conntrack_pptp_fini(void)
 {
 	nf_conntrack_helper_unregister(&pptp);
-	nf_ct_gre_keymap_flush(&init_net);
+	unregister_pernet_subsys(&nf_conntrack_pptp_net_ops);
 }
 
 module_init(nf_conntrack_pptp_init);
-- 
cgit v1.2.3


From b8b8063e0d0835fb44c88d9fded2be31c9a1757e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:10 +0200
Subject: netfilter: netns nat: fix ipt_MASQUERADE in netns

First, allow entry in notifier hook.
Second, start conntrack cleanup in netns to which netdevice belongs.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_MASQUERADE.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 5e1c81791e5..65c811b27b7 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -119,9 +119,7 @@ static int masq_device_event(struct notifier_block *this,
 			     void *ptr)
 {
 	const struct net_device *dev = ptr;
-
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
+	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_DOWN) {
 		/* Device was downed.  Search entire table for
@@ -129,7 +127,7 @@ static int masq_device_event(struct notifier_block *this,
 		   and forget them. */
 		NF_CT_ASSERT(dev->ifindex != 0);
 
-		nf_ct_iterate_cleanup(&init_net, device_cmp,
+		nf_ct_iterate_cleanup(net, device_cmp,
 				      (void *)(long)dev->ifindex);
 	}
 
-- 
cgit v1.2.3


From e099a173573ce1ba171092aee7bb3c72ea686e59 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:10 +0200
Subject: netfilter: netns nat: per-netns NAT table

Same story as with iptable_filter, iptables_raw tables.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_rule.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index e8b4d0d4439..0a02a8caf3b 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -33,7 +33,7 @@ static struct
 	struct ipt_replace repl;
 	struct ipt_standard entries[3];
 	struct ipt_error term;
-} nat_initial_table __initdata = {
+} nat_initial_table __net_initdata = {
 	.repl = {
 		.name = "nat",
 		.valid_hooks = NAT_VALID_HOOKS,
@@ -58,14 +58,13 @@ static struct
 	.term = IPT_ERROR_INIT,			/* ERROR */
 };
 
-static struct xt_table __nat_table = {
+static struct xt_table nat_table = {
 	.name		= "nat",
 	.valid_hooks	= NAT_VALID_HOOKS,
 	.lock		= __RW_LOCK_UNLOCKED(__nat_table.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET,
 };
-static struct xt_table *nat_table;
 
 /* Source NAT */
 static unsigned int ipt_snat_target(struct sk_buff *skb,
@@ -194,9 +193,10 @@ int nf_nat_rule_find(struct sk_buff *skb,
 		     const struct net_device *out,
 		     struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	int ret;
 
-	ret = ipt_do_table(skb, hooknum, in, out, nat_table);
+	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
 
 	if (ret == NF_ACCEPT) {
 		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
@@ -226,14 +226,32 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
 	.family		= AF_INET,
 };
 
+static int __net_init nf_nat_rule_net_init(struct net *net)
+{
+	net->ipv4.nat_table = ipt_register_table(net, &nat_table,
+						 &nat_initial_table.repl);
+	if (IS_ERR(net->ipv4.nat_table))
+		return PTR_ERR(net->ipv4.nat_table);
+	return 0;
+}
+
+static void __net_exit nf_nat_rule_net_exit(struct net *net)
+{
+	ipt_unregister_table(net->ipv4.nat_table);
+}
+
+static struct pernet_operations nf_nat_rule_net_ops = {
+	.init = nf_nat_rule_net_init,
+	.exit = nf_nat_rule_net_exit,
+};
+
 int __init nf_nat_rule_init(void)
 {
 	int ret;
 
-	nat_table = ipt_register_table(&init_net, &__nat_table,
-				       &nat_initial_table.repl);
-	if (IS_ERR(nat_table))
-		return PTR_ERR(nat_table);
+	ret = register_pernet_subsys(&nf_nat_rule_net_ops);
+	if (ret != 0)
+		goto out;
 	ret = xt_register_target(&ipt_snat_reg);
 	if (ret != 0)
 		goto unregister_table;
@@ -247,8 +265,8 @@ int __init nf_nat_rule_init(void)
  unregister_snat:
 	xt_unregister_target(&ipt_snat_reg);
  unregister_table:
-	ipt_unregister_table(nat_table);
-
+	unregister_pernet_subsys(&nf_nat_rule_net_ops);
+ out:
 	return ret;
 }
 
@@ -256,5 +274,5 @@ void nf_nat_rule_cleanup(void)
 {
 	xt_unregister_target(&ipt_dnat_reg);
 	xt_unregister_target(&ipt_snat_reg);
-	ipt_unregister_table(nat_table);
+	unregister_pernet_subsys(&nf_nat_rule_net_ops);
 }
-- 
cgit v1.2.3


From 0c4c9288ada0e6642d511ef872f10a4781a896ff Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:11 +0200
Subject: netfilter: netns nat: per-netns bysource hash

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_core.c | 72 +++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 5d4a5b70da2..2ac9eaf1a8c 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -37,9 +37,6 @@ static struct nf_conntrack_l3proto *l3proto __read_mostly;
 
 /* Calculated at init based on memory size */
 static unsigned int nf_nat_htable_size __read_mostly;
-static int nf_nat_vmalloced;
-
-static struct hlist_head *bysource __read_mostly;
 
 #define MAX_IP_NAT_PROTO 256
 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
@@ -145,7 +142,8 @@ same_src(const struct nf_conn *ct,
 
 /* Only called for SRC manip */
 static int
-find_appropriate_src(const struct nf_conntrack_tuple *tuple,
+find_appropriate_src(struct net *net,
+		     const struct nf_conntrack_tuple *tuple,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
@@ -155,7 +153,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple,
 	const struct hlist_node *n;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) {
+	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
 		ct = nat->ct;
 		if (same_src(ct, tuple)) {
 			/* Copy source part from reply tuple. */
@@ -231,6 +229,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 struct nf_conn *ct,
 		 enum nf_nat_manip_type maniptype)
 {
+	struct net *net = nf_ct_net(ct);
 	const struct nf_nat_protocol *proto;
 
 	/* 1) If this srcip/proto/src-proto-part is currently mapped,
@@ -242,7 +241,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	   manips not an issue.  */
 	if (maniptype == IP_NAT_MANIP_SRC &&
 	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
-		if (find_appropriate_src(orig_tuple, tuple, range)) {
+		if (find_appropriate_src(net, orig_tuple, tuple, range)) {
 			pr_debug("get_unique_tuple: Found current src map\n");
 			if (!nf_nat_used_tuple(tuple, ct))
 				return;
@@ -283,6 +282,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 	struct nf_conn_nat *nat;
 	int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
@@ -334,7 +334,8 @@ nf_nat_setup_info(struct nf_conn *ct,
 		/* nf_conntrack_alter_reply might re-allocate exntension aera */
 		nat = nfct_nat(ct);
 		nat->ct = ct;
-		hlist_add_head_rcu(&nat->bysource, &bysource[srchash]);
+		hlist_add_head_rcu(&nat->bysource,
+				   &net->ipv4.nat_bysource[srchash]);
 		spin_unlock_bh(&nf_nat_lock);
 	}
 
@@ -583,6 +584,40 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
 	.flags		= NF_CT_EXT_F_PREALLOC,
 };
 
+static int __net_init nf_nat_net_init(struct net *net)
+{
+	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
+						      &net->ipv4.nat_vmalloced);
+	if (!net->ipv4.nat_bysource)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Clear NAT section of all conntracks, in case we're loaded again. */
+static int clean_nat(struct nf_conn *i, void *data)
+{
+	struct nf_conn_nat *nat = nfct_nat(i);
+
+	if (!nat)
+		return 0;
+	memset(nat, 0, sizeof(*nat));
+	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
+	return 0;
+}
+
+static void __net_exit nf_nat_net_exit(struct net *net)
+{
+	nf_ct_iterate_cleanup(net, &clean_nat, NULL);
+	synchronize_rcu();
+	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
+			     nf_nat_htable_size);
+}
+
+static struct pernet_operations nf_nat_net_ops = {
+	.init = nf_nat_net_init,
+	.exit = nf_nat_net_exit,
+};
+
 static int __init nf_nat_init(void)
 {
 	size_t i;
@@ -599,12 +634,9 @@ static int __init nf_nat_init(void)
 	/* Leave them the same for the moment. */
 	nf_nat_htable_size = nf_conntrack_htable_size;
 
-	bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
-					 &nf_nat_vmalloced);
-	if (!bysource) {
-		ret = -ENOMEM;
+	ret = register_pernet_subsys(&nf_nat_net_ops);
+	if (ret < 0)
 		goto cleanup_extend;
-	}
 
 	/* Sew in builtin protocols. */
 	spin_lock_bh(&nf_nat_lock);
@@ -629,23 +661,9 @@ static int __init nf_nat_init(void)
 	return ret;
 }
 
-/* Clear NAT section of all conntracks, in case we're loaded again. */
-static int clean_nat(struct nf_conn *i, void *data)
-{
-	struct nf_conn_nat *nat = nfct_nat(i);
-
-	if (!nat)
-		return 0;
-	memset(nat, 0, sizeof(*nat));
-	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
-	return 0;
-}
-
 static void __exit nf_nat_cleanup(void)
 {
-	nf_ct_iterate_cleanup(&init_net, &clean_nat, NULL);
-	synchronize_rcu();
-	nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size);
+	unregister_pernet_subsys(&nf_nat_net_ops);
 	nf_ct_l3proto_put(l3proto);
 	nf_ct_extend_unregister(&nat_extend);
 	rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
-- 
cgit v1.2.3


From 9174c1538fffbb5dddab99563eac6b3d8b212277 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:11 +0200
Subject: netfilter: netns nf_conntrack: fixup DNAT in netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_rule.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 0a02a8caf3b..f929352ec0e 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -91,13 +91,13 @@ static unsigned int ipt_snat_target(struct sk_buff *skb,
 }
 
 /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
-static void warn_if_extra_mangle(__be32 dstip, __be32 srcip)
+static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip)
 {
 	static int warned = 0;
 	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
 	struct rtable *rt;
 
-	if (ip_route_output_key(&init_net, &rt, &fl) != 0)
+	if (ip_route_output_key(net, &rt, &fl) != 0)
 		return;
 
 	if (rt->rt_src != srcip && !warned) {
@@ -130,7 +130,7 @@ static unsigned int ipt_dnat_target(struct sk_buff *skb,
 
 	if (hooknum == NF_INET_LOCAL_OUT &&
 	    mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
-		warn_if_extra_mangle(ip_hdr(skb)->daddr,
+		warn_if_extra_mangle(dev_net(out), ip_hdr(skb)->daddr,
 				     mr->range[0].min_ip);
 
 	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
-- 
cgit v1.2.3


From cfd6e3d74751b62b6d0844e24c911776e40a0135 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:11 +0200
Subject: netfilter: netns nat: PPTP NAT in netns

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_pptp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index e4bdddc6034..9eb171056c6 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -40,6 +40,7 @@ MODULE_ALIAS("ip_nat_pptp");
 static void pptp_nat_expected(struct nf_conn *ct,
 			      struct nf_conntrack_expect *exp)
 {
+	struct net *net = nf_ct_net(ct);
 	const struct nf_conn *master = ct->master;
 	struct nf_conntrack_expect *other_exp;
 	struct nf_conntrack_tuple t;
@@ -73,7 +74,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 
 	pr_debug("trying to unexpect other dir: ");
 	nf_ct_dump_tuple_ip(&t);
-	other_exp = nf_ct_expect_find_get(&init_net, &t);
+	other_exp = nf_ct_expect_find_get(net, &t);
 	if (other_exp) {
 		nf_ct_unexpect_related(other_exp);
 		nf_ct_expect_put(other_exp);
-- 
cgit v1.2.3


From 4de6f16b9ec2422fa7ef9c22f7b1c8d5a55499b4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 8 Oct 2008 11:35:11 +0200
Subject: netfilter: enable netfilter in netns

From kernel perspective, allow entrance in nf_hook_slow().

Stuff which uses nf_register_hook/nf_register_hooks, but otherwise not netns-ready:

	DECnet netfilter
	ipt_CLUSTERIP
	nf_nat_standalone.c together with XFRM (?)
	IPVS
	several individual match modules (like hashlimit)
	ctnetlink
	NOTRACK
	all sorts of queueing and reporting to userspace
	L3 and L4 protocol sysctls, bridge sysctls
	probably something else

Anyway critical mass has been achieved, there is no reason to hide netfilter any longer.

From userspace perspective, allow to manipulate all sorts of
iptables/ip6tables/arptables rules.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/core.c       | 8 --------
 net/netfilter/nf_sockopt.c | 3 ---
 2 files changed, 11 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b16cd79951c..a90ac83c591 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -165,14 +165,6 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
 	unsigned int verdict;
 	int ret = 0;
 
-#ifdef CONFIG_NET_NS
-	struct net *net;
-
-	net = indev == NULL ? dev_net(outdev) : dev_net(indev);
-	if (net != &init_net)
-		return 1;
-#endif
-
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
 
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index f9b46de6a3d..8ab829f8657 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -65,9 +65,6 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
 {
 	struct nf_sockopt_ops *ops;
 
-	if (!net_eq(sock_net(sk), &init_net))
-		return ERR_PTR(-ENOPROTOOPT);
-
 	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
 		return ERR_PTR(-EINTR);
 
-- 
cgit v1.2.3


From 73e4022f78acdbe420e8c24a7afbd90f4c8f5077 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 8 Oct 2008 11:35:12 +0200
Subject: netfilter: split netfilter IPv4 defragmentation into a separate
 module

Netfilter connection tracking requires all IPv4 packets to be defragmented.
Both the socket match and the TPROXY target depend on this functionality, so
this patch separates the Netfilter IPv4 defrag hooks into a separate module.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig                     |  5 ++
 net/ipv4/netfilter/Makefile                    |  3 +
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 56 +--------------
 net/ipv4/netfilter/nf_defrag_ipv4.c            | 96 ++++++++++++++++++++++++++
 4 files changed, 107 insertions(+), 53 deletions(-)
 create mode 100644 net/ipv4/netfilter/nf_defrag_ipv4.c

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4e842d56642..07757ac8d5d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -5,10 +5,15 @@
 menu "IP: Netfilter Configuration"
 	depends on INET && NETFILTER
 
+config NF_DEFRAG_IPV4
+	tristate
+	default n
+
 config NF_CONNTRACK_IPV4
 	tristate "IPv4 connection tracking support (required for NAT)"
 	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
+	select NF_DEFRAG_IPV4
 	---help---
 	  Connection tracking keeps a record of what packets have passed
 	  through your machine, in order to figure out how they are related
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 1107edbe478..5f9b650d90f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -18,6 +18,9 @@ obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
 
 obj-$(CONFIG_NF_NAT) += nf_nat.o
 
+# defrag
+obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+
 # NAT helpers (nf_conntrack)
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
 obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index af69acc1d0f..4a7c3527539 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -1,3 +1,4 @@
+
 /* (C) 1999-2001 Paul `Rusty' Russell
  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
  *
@@ -24,6 +25,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 
 int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
 			      struct nf_conn *ct,
@@ -63,23 +65,6 @@ static int ipv4_print_tuple(struct seq_file *s,
 			  NIPQUAD(tuple->dst.u3.ip));
 }
 
-/* Returns new sk_buff, or NULL */
-static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
-{
-	int err;
-
-	skb_orphan(skb);
-
-	local_bh_disable();
-	err = ip_defrag(skb, user);
-	local_bh_enable();
-
-	if (!err)
-		ip_send_check(ip_hdr(skb));
-
-	return err;
-}
-
 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			    unsigned int *dataoff, u_int8_t *protonum)
 {
@@ -144,28 +129,6 @@ out:
 	return nf_conntrack_confirm(skb);
 }
 
-static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
-					  struct sk_buff *skb,
-					  const struct net_device *in,
-					  const struct net_device *out,
-					  int (*okfn)(struct sk_buff *))
-{
-	/* Previously seen (loopback)?  Ignore.  Do this before
-	   fragment check. */
-	if (skb->nfct)
-		return NF_ACCEPT;
-
-	/* Gather fragments. */
-	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (nf_ct_ipv4_gather_frags(skb,
-					    hooknum == NF_INET_PRE_ROUTING ?
-					    IP_DEFRAG_CONNTRACK_IN :
-					    IP_DEFRAG_CONNTRACK_OUT))
-			return NF_STOLEN;
-	}
-	return NF_ACCEPT;
-}
-
 static unsigned int ipv4_conntrack_in(unsigned int hooknum,
 				      struct sk_buff *skb,
 				      const struct net_device *in,
@@ -194,13 +157,6 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
 /* Connection tracking may drop packets, but never alters them, so
    make it the first hook. */
 static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
-	{
-		.hook		= ipv4_conntrack_defrag,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
-	},
 	{
 		.hook		= ipv4_conntrack_in,
 		.owner		= THIS_MODULE,
@@ -208,13 +164,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP_PRI_CONNTRACK,
 	},
-	{
-		.hook           = ipv4_conntrack_defrag,
-		.owner          = THIS_MODULE,
-		.pf             = PF_INET,
-		.hooknum        = NF_INET_LOCAL_OUT,
-		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
-	},
 	{
 		.hook		= ipv4_conntrack_local,
 		.owner		= THIS_MODULE,
@@ -422,6 +371,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 	int ret = 0;
 
 	need_conntrack();
+	nf_defrag_ipv4_enable();
 
 	ret = nf_register_sockopt(&so_getorigdst);
 	if (ret < 0) {
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 00000000000..aa2c50a180f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,96 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+/* Returns new sk_buff, or NULL */
+static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	int err;
+
+	skb_orphan(skb);
+
+	local_bh_disable();
+	err = ip_defrag(skb, user);
+	local_bh_enable();
+
+	if (!err)
+		ip_send_check(ip_hdr(skb));
+
+	return err;
+}
+
+static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+					  struct sk_buff *skb,
+					  const struct net_device *in,
+					  const struct net_device *out,
+					  int (*okfn)(struct sk_buff *))
+{
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	/* Previously seen (loopback)?  Ignore.  Do this before
+	   fragment check. */
+	if (skb->nfct)
+		return NF_ACCEPT;
+#endif
+
+	/* Gather fragments. */
+	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+		if (nf_ct_ipv4_gather_frags(skb,
+					    hooknum == NF_INET_PRE_ROUTING ?
+					    IP_DEFRAG_CONNTRACK_IN :
+					    IP_DEFRAG_CONNTRACK_OUT))
+			return NF_STOLEN;
+	}
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops ipv4_defrag_ops[] = {
+	{
+		.hook		= ipv4_conntrack_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+	},
+	{
+		.hook           = ipv4_conntrack_defrag,
+		.owner          = THIS_MODULE,
+		.pf             = PF_INET,
+		.hooknum        = NF_INET_LOCAL_OUT,
+		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
+	},
+};
+
+static int __init nf_defrag_init(void)
+{
+	return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+}
+
+static void __exit nf_defrag_fini(void)
+{
+	nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+}
+
+void nf_defrag_ipv4_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 9ad2d745a23853927a19789b034d9eb2e62d78ee Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 8 Oct 2008 11:35:12 +0200
Subject: netfilter: iptables tproxy core

The iptables tproxy core is a module that contains the common routines used by
various tproxy related modules (TPROXY target and socket match)

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/Kconfig          | 15 +++++++
 net/netfilter/Makefile         |  3 ++
 net/netfilter/nf_tproxy_core.c | 96 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+)
 create mode 100644 net/netfilter/nf_tproxy_core.c

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4a464857f21..ed1dcfb61e1 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -287,6 +287,21 @@ config NF_CT_NETLINK
 	help
 	  This option enables support for a netlink-based userspace interface
 
+# transparent proxy support
+config NETFILTER_TPROXY
+	tristate "Transparent proxying support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on IP_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables transparent proxying support, that is,
+	  support for handling non-locally bound IPv4 TCP and UDP sockets.
+	  For it to work you will have to configure certain iptables rules
+	  and use policy routing. For more information on how to set it up
+	  see Documentation/networking/tproxy.txt.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index f101cf61e6f..fc8bbb48d38 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -34,6 +34,9 @@ obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
 obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
 obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
 
+# transparent proxy support
+obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o
+
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
new file mode 100644
index 00000000000..fe34f4bf74c
--- /dev/null
+++ b/net/netfilter/nf_tproxy_core.c
@@ -0,0 +1,96 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Author: Balazs Scheidler, Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/net.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <net/udp.h>
+#include <net/netfilter/nf_tproxy_core.h>
+
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in, bool listening_only)
+{
+	struct sock *sk;
+
+	/* look up socket */
+	switch (protocol) {
+	case IPPROTO_TCP:
+		if (listening_only)
+			sk = __inet_lookup_listener(net, &tcp_hashinfo,
+						    daddr, ntohs(dport),
+						    in->ifindex);
+		else
+			sk = __inet_lookup(net, &tcp_hashinfo,
+					   saddr, sport, daddr, dport,
+					   in->ifindex);
+		break;
+	case IPPROTO_UDP:
+		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
+
+static void
+nf_tproxy_destructor(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	skb->sk = NULL;
+	skb->destructor = NULL;
+
+	if (sk)
+		nf_tproxy_put_sock(sk);
+}
+
+/* consumes sk */
+int
+nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
+{
+	if (inet_sk(sk)->transparent) {
+		skb->sk = sk;
+		skb->destructor = nf_tproxy_destructor;
+		return 1;
+	} else
+		nf_tproxy_put_sock(sk);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
+
+static int __init nf_tproxy_init(void)
+{
+	pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n");
+	pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n");
+	return 0;
+}
+
+module_init(nf_tproxy_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_DESCRIPTION("Transparent proxy support core routines");
-- 
cgit v1.2.3


From 136cdc71fd54e77463e570643ac76e2b696e48a0 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 8 Oct 2008 11:35:12 +0200
Subject: netfilter: iptables socket match

Add iptables 'socket' match, which matches packets for which a TCP/UDP
socket lookup succeeds.

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/Kconfig     |  15 ++++
 net/netfilter/Makefile    |   1 +
 net/netfilter/xt_socket.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 208 insertions(+)
 create mode 100644 net/netfilter/xt_socket.c

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ed1dcfb61e1..f6c80729948 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -778,6 +778,21 @@ config NETFILTER_XT_MATCH_SCTP
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_SOCKET
+	tristate '"socket" match support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_TPROXY
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	select NF_DEFRAG_IPV4
+	help
+	  This option adds a `socket' match, which can be used to match
+	  packets for which a TCP or UDP socket lookup finds a valid socket.
+	  It can be used in combination with the MARK target and policy
+	  routing to implement full featured non-locally bound sockets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_STATE
 	tristate '"state" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index fc8bbb48d38..1cdc3a13d01 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
new file mode 100644
index 00000000000..ac9db17c7b9
--- /dev/null
+++ b/net/netfilter/xt_socket.c
@@ -0,0 +1,192 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_tproxy_core.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#define XT_SOCKET_HAVE_CONNTRACK 1
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp_fields(const struct sk_buff *skb,
+		    u8 *protocol,
+		    __be32 *raddr,
+		    __be32 *laddr,
+		    __be16 *rport,
+		    __be16 *lport)
+{
+	unsigned int outside_hdrlen = ip_hdrlen(skb);
+	struct iphdr *inside_iph, _inside_iph;
+	struct icmphdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+
+	icmph = skb_header_pointer(skb, outside_hdrlen,
+				   sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	switch (icmph->type) {
+	case ICMP_DEST_UNREACH:
+	case ICMP_SOURCE_QUENCH:
+	case ICMP_REDIRECT:
+	case ICMP_TIME_EXCEEDED:
+	case ICMP_PARAMETERPROB:
+		break;
+	default:
+		return 1;
+	}
+
+	inside_iph = skb_header_pointer(skb, outside_hdrlen +
+					sizeof(struct icmphdr),
+					sizeof(_inside_iph), &_inside_iph);
+	if (inside_iph == NULL)
+		return 1;
+
+	if (inside_iph->protocol != IPPROTO_TCP &&
+	    inside_iph->protocol != IPPROTO_UDP)
+		return 1;
+
+	ports = skb_header_pointer(skb, outside_hdrlen +
+				   sizeof(struct icmphdr) +
+				   (inside_iph->ihl << 2),
+				   sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inside_iph->protocol;
+	*laddr = inside_iph->saddr;
+	*lport = ports[0];
+	*raddr = inside_iph->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+
+static bool
+socket_mt(const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const struct xt_match *match,
+	  const void *matchinfo,
+	  int offset,
+	  unsigned int protoff,
+	  bool *hotdrop)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	__be32 daddr, saddr;
+	__be16 dport, sport;
+	u8 protocol;
+#ifdef XT_SOCKET_HAVE_CONNTRACK
+	struct nf_conn const *ct;
+	enum ip_conntrack_info ctinfo;
+#endif
+
+	if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return false;
+
+		protocol = iph->protocol;
+		saddr = iph->saddr;
+		sport = hp->source;
+		daddr = iph->daddr;
+		dport = hp->dest;
+
+	} else if (iph->protocol == IPPROTO_ICMP) {
+		if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+					&sport, &dport))
+			return false;
+	} else {
+		return false;
+	}
+
+#ifdef XT_SOCKET_HAVE_CONNTRACK
+	/* Do the lookup with the original socket address in case this is a
+	 * reply packet of an established SNAT-ted connection. */
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct && (ct != &nf_conntrack_untracked) &&
+	    ((iph->protocol != IPPROTO_ICMP &&
+	      ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
+	     (iph->protocol == IPPROTO_ICMP &&
+	      ctinfo == IP_CT_IS_REPLY + IP_CT_RELATED)) &&
+	    (ct->status & IPS_SRC_NAT_DONE)) {
+
+		daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+		dport = (iph->protocol == IPPROTO_TCP) ?
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+	}
+#endif
+
+	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
+				   saddr, daddr, sport, dport, in, false);
+	if (sk != NULL) {
+		bool wildcard = (inet_sk(sk)->rcv_saddr == 0);
+
+		nf_tproxy_put_sock(sk);
+		if (wildcard)
+			sk = NULL;
+	}
+
+	pr_debug("socket match: proto %u %08x:%u -> %08x:%u "
+		 "(orig %08x:%u) sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport),
+		 ntohl(daddr), ntohs(dport),
+		 ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+
+	return (sk != NULL);
+}
+
+static struct xt_match socket_mt_reg __read_mostly = {
+	.name		= "socket",
+	.family		= AF_INET,
+	.match		= socket_mt,
+	.hooks		= 1 << NF_INET_PRE_ROUTING,
+	.me		= THIS_MODULE,
+};
+
+static int __init socket_mt_init(void)
+{
+	nf_defrag_ipv4_enable();
+	return xt_register_match(&socket_mt_reg);
+}
+
+static void __exit socket_mt_exit(void)
+{
+	xt_unregister_match(&socket_mt_reg);
+}
+
+module_init(socket_mt_init);
+module_exit(socket_mt_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("x_tables socket match module");
+MODULE_ALIAS("ipt_socket");
-- 
cgit v1.2.3


From e84392707e10301b93121e1b74e2823db50cdf9e Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@sch.bme.hu>
Date: Wed, 8 Oct 2008 11:35:12 +0200
Subject: netfilter: iptables TPROXY target

The TPROXY target implements redirection of non-local TCP/UDP traffic to local
sockets. Additionally, it's possible to manipulate the packet mark if and only
if a socket has been found. (We need this because we cannot use multiple
targets in the same iptables rule.)

Signed-off-by: KOVACS Krisztian <hidden@sch.bme.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/Kconfig     |  15 +++++++
 net/netfilter/Makefile    |   1 +
 net/netfilter/xt_TPROXY.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 net/netfilter/xt_TPROXY.c

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f6c80729948..de18bba619f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -421,6 +421,21 @@ config NETFILTER_XT_TARGET_RATEEST
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_TPROXY
+	tristate '"TPROXY" target support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_TPROXY
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	select NF_DEFRAG_IPV4
+	help
+	  This option adds a `TPROXY' target, which is somewhat similar to
+	  REDIRECT.  It can only be used in the mangle table and is useful
+	  to redirect traffic to a transparent proxy.  It does _not_ depend
+	  on Netfilter connection tracking and NAT, unlike REDIRECT.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_TRACE
 	tristate  '"TRACE" target support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 1cdc3a13d01..8ce67665882 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
new file mode 100644
index 00000000000..183f251d2f0
--- /dev/null
+++ b/net/netfilter/xt_TPROXY.c
@@ -0,0 +1,112 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Author: Balazs Scheidler, Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/inet_sock.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/nf_tproxy_core.h>
+
+static unsigned int
+tproxy_tg(struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const struct xt_target *target,
+	  const void *targinfo)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct xt_tproxy_target_info *tgi = targinfo;
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL)
+		return NF_DROP;
+
+	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+				   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
+				   hp->source, tgi->lport ? tgi->lport : hp->dest,
+				   in, true);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+		/* This should be in a separate target, but we don't do multiple
+		   targets on the same rule yet */
+		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
+
+		pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
+			 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
+			 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+		return NF_ACCEPT;
+	}
+
+	pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
+		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
+		 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+	return NF_DROP;
+}
+
+static bool
+tproxy_tg_check(const char *tablename,
+		const void *entry,
+		const struct xt_target *target,
+		void *targetinfo,
+		unsigned int hook_mask)
+{
+	const struct ipt_ip *i = entry;
+
+	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+	    && !(i->invflags & IPT_INV_PROTO))
+		return true;
+
+	pr_info("xt_TPROXY: Can be used only in combination with "
+		"either -p tcp or -p udp\n");
+	return false;
+}
+
+static struct xt_target tproxy_tg_reg __read_mostly = {
+	.name		= "TPROXY",
+	.family		= AF_INET,
+	.table		= "mangle",
+	.target		= tproxy_tg,
+	.targetsize	= sizeof(struct xt_tproxy_target_info),
+	.checkentry	= tproxy_tg_check,
+	.hooks		= 1 << NF_INET_PRE_ROUTING,
+	.me		= THIS_MODULE,
+};
+
+static int __init tproxy_tg_init(void)
+{
+	nf_defrag_ipv4_enable();
+	return xt_register_target(&tproxy_tg_reg);
+}
+
+static void __exit tproxy_tg_exit(void)
+{
+	xt_unregister_target(&tproxy_tg_reg);
+}
+
+module_init(tproxy_tg_init);
+module_exit(tproxy_tg_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
+MODULE_ALIAS("ipt_TPROXY");
-- 
cgit v1.2.3


From 18219d3f7d6a5bc43825a41e0763158efbdb80d3 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:13 +0200
Subject: netfilter: ebtables: do centralized size checking

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c    |  7 +++---
 net/bridge/netfilter/ebt_among.c    |  1 +
 net/bridge/netfilter/ebt_arp.c      |  9 ++++----
 net/bridge/netfilter/ebt_arpreply.c |  9 ++++----
 net/bridge/netfilter/ebt_dnat.c     |  9 ++++----
 net/bridge/netfilter/ebt_ip.c       |  9 ++++----
 net/bridge/netfilter/ebt_ip6.c      |  9 ++++----
 net/bridge/netfilter/ebt_limit.c    | 11 ++++------
 net/bridge/netfilter/ebt_log.c      | 11 +++++-----
 net/bridge/netfilter/ebt_mark.c     |  6 +++---
 net/bridge/netfilter/ebt_mark_m.c   |  7 +++---
 net/bridge/netfilter/ebt_nflog.c    |  4 ++--
 net/bridge/netfilter/ebt_pkttype.c  |  7 +++---
 net/bridge/netfilter/ebt_redirect.c | 11 +++++-----
 net/bridge/netfilter/ebt_snat.c     | 11 +++++-----
 net/bridge/netfilter/ebt_stp.c      | 10 ++++-----
 net/bridge/netfilter/ebt_ulog.c     |  5 +++--
 net/bridge/netfilter/ebt_vlan.c     | 10 ++-------
 net/bridge/netfilter/ebtables.c     | 43 ++++++++++++++++++++++++++++++++-----
 19 files changed, 101 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 98534025360..ccecfbd2a25 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -7,10 +7,10 @@
  * May 2003
  *
  */
-
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_802_3.h>
-#include <linux/module.h>
 
 static int ebt_filter_802_3(const struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out, const void *data, unsigned int datalen)
@@ -42,8 +42,6 @@ static int ebt_802_3_check(const char *tablename, unsigned int hookmask,
 {
 	const struct ebt_802_3_info *info = data;
 
-	if (datalen < sizeof(struct ebt_802_3_info))
-		return -EINVAL;
 	if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK)
 		return -EINVAL;
 
@@ -54,6 +52,7 @@ static struct ebt_match filter_802_3 __read_mostly = {
 	.name		= EBT_802_3_MATCH,
 	.match		= ebt_filter_802_3,
 	.check		= ebt_802_3_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_802_3_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 70b6dca5ea7..b0acb13a390 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -216,6 +216,7 @@ static struct ebt_match filter_among __read_mostly = {
 	.name		= EBT_AMONG_MATCH,
 	.match		= ebt_filter_among,
 	.check		= ebt_among_check,
+	.matchsize	= -1, /* special case */
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 7c535be7566..385f9cb85bc 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -8,12 +8,12 @@
  *  April, 2002
  *
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_arp.h>
 #include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arp.h>
 
 static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out, const void *data, unsigned int datalen)
@@ -105,8 +105,6 @@ static int ebt_arp_check(const char *tablename, unsigned int hookmask,
 {
 	const struct ebt_arp_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_arp_info)))
-		return -EINVAL;
 	if ((e->ethproto != htons(ETH_P_ARP) &&
 	   e->ethproto != htons(ETH_P_RARP)) ||
 	   e->invflags & EBT_IPROTO)
@@ -120,6 +118,7 @@ static struct ebt_match filter_arp __read_mostly = {
 	.name		= EBT_ARP_MATCH,
 	.match		= ebt_filter_arp,
 	.check		= ebt_arp_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_arp_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 0c4279590fc..a860ea6da46 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -8,12 +8,12 @@
  *  August, 2003
  *
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_arpreply.h>
 #include <linux/if_arp.h>
 #include <net/arp.h>
 #include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arpreply.h>
 
 static int ebt_target_reply(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
@@ -63,8 +63,6 @@ static int ebt_target_reply_check(const char *tablename, unsigned int hookmask,
 {
 	const struct ebt_arpreply_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_arpreply_info)))
-		return -EINVAL;
 	if (BASE_CHAIN && info->target == EBT_RETURN)
 		return -EINVAL;
 	if (e->ethproto != htons(ETH_P_ARP) ||
@@ -80,6 +78,7 @@ static struct ebt_target reply_target __read_mostly = {
 	.name		= EBT_ARPREPLY_TARGET,
 	.target		= ebt_target_reply,
 	.check		= ebt_target_reply_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_arpreply_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index ca64c1cc1b4..c2be41e8bb9 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -7,12 +7,12 @@
  *  June, 2002
  *
  */
-
+#include <linux/module.h>
+#include <net/sock.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_nat.h>
-#include <linux/module.h>
-#include <net/sock.h>
 
 static int ebt_target_dnat(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
@@ -39,8 +39,6 @@ static int ebt_target_dnat_check(const char *tablename, unsigned int hookmask,
 	   (hookmask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) &&
 	   (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
 		return -EINVAL;
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_nat_info)))
-		return -EINVAL;
 	if (INVALID_TARGET)
 		return -EINVAL;
 	return 0;
@@ -50,6 +48,7 @@ static struct ebt_target dnat __read_mostly = {
 	.name		= EBT_DNAT_TARGET,
 	.target		= ebt_target_dnat,
 	.check		= ebt_target_dnat_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index 65caa00dcf2..c1ae2547e3d 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -11,13 +11,13 @@
  *    Innominate Security Technologies AG <mhopf@innominate.com>
  *    September, 2002
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_ip.h>
 #include <linux/ip.h>
 #include <net/ip.h>
 #include <linux/in.h>
 #include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip.h>
 
 struct tcpudphdr {
 	__be16 src;
@@ -83,8 +83,6 @@ static int ebt_ip_check(const char *tablename, unsigned int hookmask,
 {
 	const struct ebt_ip_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_ip_info)))
-		return -EINVAL;
 	if (e->ethproto != htons(ETH_P_IP) ||
 	   e->invflags & EBT_IPROTO)
 		return -EINVAL;
@@ -111,6 +109,7 @@ static struct ebt_match filter_ip __read_mostly = {
 	.name		= EBT_IP_MATCH,
 	.match		= ebt_filter_ip,
 	.check		= ebt_ip_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_ip_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 36efb3a7524..554dd68637c 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -13,14 +13,14 @@
  *
  *  Jan, 2008
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_ip6.h>
 #include <linux/ipv6.h>
 #include <net/ipv6.h>
 #include <linux/in.h>
 #include <linux/module.h>
 #include <net/dsfield.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip6.h>
 
 struct tcpudphdr {
 	__be16 src;
@@ -97,8 +97,6 @@ static int ebt_ip6_check(const char *tablename, unsigned int hookmask,
 {
 	struct ebt_ip6_info *info = (struct ebt_ip6_info *)data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_ip6_info)))
-		return -EINVAL;
 	if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
 		return -EINVAL;
 	if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK)
@@ -125,6 +123,7 @@ static struct ebt_match filter_ip6 =
 	.name		= EBT_IP6_MATCH,
 	.match		= ebt_filter_ip6,
 	.check		= ebt_ip6_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_ip6_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 8cbdc01c253..3d71f3510ff 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -10,13 +10,12 @@
  *  September, 2003
  *
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_limit.h>
 #include <linux/module.h>
-
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_limit.h>
 
 static DEFINE_SPINLOCK(limit_lock);
 
@@ -71,9 +70,6 @@ static int ebt_limit_check(const char *tablename, unsigned int hookmask,
 {
 	struct ebt_limit_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_limit_info)))
-		return -EINVAL;
-
 	/* Check for overflow. */
 	if (info->burst == 0 ||
 	    user2credits(info->avg * info->burst) < user2credits(info->avg)) {
@@ -94,6 +90,7 @@ static struct ebt_match ebt_limit_reg __read_mostly = {
 	.name		= EBT_LIMIT_MATCH,
 	.match		= ebt_limit_match,
 	.check		= ebt_limit_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_limit_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 8b17c64bcd7..d9596f114a3 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -8,10 +8,6 @@
  *  April, 2002
  *
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_log.h>
-#include <linux/netfilter.h>
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/in.h>
@@ -21,6 +17,10 @@
 #include <linux/ipv6.h>
 #include <net/ipv6.h>
 #include <linux/in6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
 
 static DEFINE_SPINLOCK(ebt_log_lock);
 
@@ -29,8 +29,6 @@ static int ebt_log_check(const char *tablename, unsigned int hookmask,
 {
 	struct ebt_log_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_log_info)))
-		return -EINVAL;
 	if (info->bitmask & ~EBT_LOG_MASK)
 		return -EINVAL;
 	if (info->loglevel >= 8)
@@ -218,6 +216,7 @@ static struct ebt_watcher log =
 	.name		= EBT_LOG_WATCHER,
 	.watcher	= ebt_log,
 	.check		= ebt_log_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_log_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 36723f47db0..bb02412786c 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -13,9 +13,10 @@
  * Marking a frame doesn't really change anything in the frame anyway.
  */
 
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_mark_t.h>
-#include <linux/module.h>
 
 static int ebt_target_mark(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
@@ -42,8 +43,6 @@ static int ebt_target_mark_check(const char *tablename, unsigned int hookmask,
 	const struct ebt_mark_t_info *info = data;
 	int tmp;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_t_info)))
-		return -EINVAL;
 	tmp = info->target | ~EBT_VERDICT_BITS;
 	if (BASE_CHAIN && tmp == EBT_RETURN)
 		return -EINVAL;
@@ -61,6 +60,7 @@ static struct ebt_target mark_target __read_mostly = {
 	.name		= EBT_MARK_TARGET,
 	.target		= ebt_target_mark,
 	.check		= ebt_target_mark_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_mark_t_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index 9b0a4543861..b8ce9eb7170 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -7,10 +7,10 @@
  *  July, 2002
  *
  */
-
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_mark_m.h>
-#include <linux/module.h>
 
 static int ebt_filter_mark(const struct sk_buff *skb,
    const struct net_device *in, const struct net_device *out, const void *data,
@@ -28,8 +28,6 @@ static int ebt_mark_check(const char *tablename, unsigned int hookmask,
 {
 	const struct ebt_mark_m_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_m_info)))
-		return -EINVAL;
 	if (info->bitmask & ~EBT_MARK_MASK)
 		return -EINVAL;
 	if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND))
@@ -43,6 +41,7 @@ static struct ebt_match filter_mark __read_mostly = {
 	.name		= EBT_MARK_MATCH,
 	.match		= ebt_filter_mark,
 	.check		= ebt_mark_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_mark_m_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 8e799aa9e56..88ceb5eb849 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_nflog.h>
 #include <net/netfilter/nf_log.h>
@@ -42,8 +43,6 @@ static int ebt_nflog_check(const char *tablename,
 {
 	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_nflog_info)))
-		return -EINVAL;
 	if (info->flags & ~EBT_NFLOG_MASK)
 		return -EINVAL;
 	info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0';
@@ -54,6 +53,7 @@ static struct ebt_watcher nflog __read_mostly = {
 	.name = EBT_NFLOG_WATCHER,
 	.watcher = ebt_nflog,
 	.check = ebt_nflog_check,
+	.targetsize = XT_ALIGN(sizeof(struct ebt_nflog_info)),
 	.me = THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 676db32df3d..019026177f8 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -7,10 +7,10 @@
  *  April, 2003
  *
  */
-
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_pkttype.h>
-#include <linux/module.h>
 
 static int ebt_filter_pkttype(const struct sk_buff *skb,
    const struct net_device *in,
@@ -28,8 +28,6 @@ static int ebt_pkttype_check(const char *tablename, unsigned int hookmask,
 {
 	const struct ebt_pkttype_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_pkttype_info)))
-		return -EINVAL;
 	if (info->invert != 0 && info->invert != 1)
 		return -EINVAL;
 	/* Allow any pkt_type value */
@@ -40,6 +38,7 @@ static struct ebt_match filter_pkttype __read_mostly = {
 	.name		= EBT_PKTTYPE_MATCH,
 	.match		= ebt_filter_pkttype,
 	.check		= ebt_pkttype_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_pkttype_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index b8afe850cf1..04053268386 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -7,13 +7,13 @@
  *  April, 2002
  *
  */
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_redirect.h>
 #include <linux/module.h>
 #include <net/sock.h>
 #include "../br_private.h"
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_redirect.h>
 
 static int ebt_target_redirect(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
@@ -38,8 +38,6 @@ static int ebt_target_redirect_check(const char *tablename, unsigned int hookmas
 {
 	const struct ebt_redirect_info *info = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_redirect_info)))
-		return -EINVAL;
 	if (BASE_CHAIN && info->target == EBT_RETURN)
 		return -EINVAL;
 	CLEAR_BASE_CHAIN_BIT;
@@ -55,6 +53,7 @@ static struct ebt_target redirect_target __read_mostly = {
 	.name		= EBT_REDIRECT_TARGET,
 	.target		= ebt_target_redirect,
 	.check		= ebt_target_redirect_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_redirect_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 5425333dda0..abfbc6c9502 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -7,14 +7,14 @@
  *  June, 2002
  *
  */
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_nat.h>
 #include <linux/module.h>
 #include <net/sock.h>
 #include <linux/if_arp.h>
 #include <net/arp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
 
 static int ebt_target_snat(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
@@ -49,8 +49,6 @@ static int ebt_target_snat_check(const char *tablename, unsigned int hookmask,
 	const struct ebt_nat_info *info = data;
 	int tmp;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_nat_info)))
-		return -EINVAL;
 	tmp = info->target | ~EBT_VERDICT_BITS;
 	if (BASE_CHAIN && tmp == EBT_RETURN)
 		return -EINVAL;
@@ -72,6 +70,7 @@ static struct ebt_target snat __read_mostly = {
 	.name		= EBT_SNAT_TARGET,
 	.target		= ebt_target_snat,
 	.check		= ebt_target_snat_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 40f36d37607..c7a0a00dac7 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -7,11 +7,11 @@
  *
  *  July, 2003
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_stp.h>
 #include <linux/etherdevice.h>
 #include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_stp.h>
 
 #define BPDU_TYPE_CONFIG 0
 #define BPDU_TYPE_TCN 0x80
@@ -157,15 +157,12 @@ static int ebt_stp_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_stp_info *info = data;
-	const unsigned int len = EBT_ALIGN(sizeof(struct ebt_stp_info));
 	const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
 	const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 
 	if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
 	    !(info->bitmask & EBT_STP_MASK))
 		return -EINVAL;
-	if (datalen != len)
-		return -EINVAL;
 	/* Make sure the match only receives stp frames */
 	if (compare_ether_addr(e->destmac, bridge_ula) ||
 	    compare_ether_addr(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
@@ -178,6 +175,7 @@ static struct ebt_match filter_stp __read_mostly = {
 	.name		= EBT_STP_MATCH,
 	.match		= ebt_filter_stp,
 	.check		= ebt_stp_check,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_stp_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 3b1678cd66f..bdd8a27bba9 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -36,6 +36,7 @@
 #include <linux/timer.h>
 #include <linux/netlink.h>
 #include <linux/netdevice.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_ulog.h>
 #include <net/netfilter/nf_log.h>
@@ -260,8 +261,7 @@ static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
 {
 	struct ebt_ulog_info *uloginfo = data;
 
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_ulog_info)) ||
-	    uloginfo->nlgroup > 31)
+	if (uloginfo->nlgroup > 31)
 		return -EINVAL;
 
 	uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0';
@@ -276,6 +276,7 @@ static struct ebt_watcher ulog __read_mostly = {
 	.name		= EBT_ULOG_WATCHER,
 	.watcher	= ebt_ulog,
 	.check		= ebt_ulog_check,
+	.targetsize	= XT_ALIGN(sizeof(struct ebt_ulog_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index ab60b0dade8..4dba47aefc8 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -22,6 +22,7 @@
 #include <linux/if_vlan.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_vlan.h>
 
@@ -93,14 +94,6 @@ ebt_check_vlan(const char *tablename,
 {
 	struct ebt_vlan_info *info = data;
 
-	/* Parameters buffer overflow check */
-	if (datalen != EBT_ALIGN(sizeof(struct ebt_vlan_info))) {
-		DEBUG_MSG
-		    ("passed size %d is not eq to ebt_vlan_info (%Zd)\n",
-		     datalen, sizeof(struct ebt_vlan_info));
-		return -EINVAL;
-	}
-
 	/* Is it 802.1Q frame checked? */
 	if (e->ethproto != htons(ETH_P_8021Q)) {
 		DEBUG_MSG
@@ -173,6 +166,7 @@ static struct ebt_match filter_vlan __read_mostly = {
 	.name		= EBT_VLAN_MATCH,
 	.match		= ebt_filter_vlan,
 	.check		= ebt_check_vlan,
+	.matchsize	= XT_ALIGN(sizeof(struct ebt_vlan_info)),
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 32afff859e4..b04e288d20f 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -19,6 +19,7 @@
 #include <linux/kmod.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
@@ -59,8 +60,9 @@ static LIST_HEAD(ebt_targets);
 static LIST_HEAD(ebt_matches);
 static LIST_HEAD(ebt_watchers);
 
-static struct ebt_target ebt_standard_target =
-{ {NULL, NULL}, EBT_STANDARD_TARGET, NULL, NULL, NULL, NULL};
+static struct ebt_target ebt_standard_target = {
+	.name = "standard",
+};
 
 static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
    const struct sk_buff *skb, unsigned int hooknr, const struct net_device *in,
@@ -350,6 +352,18 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		return -ENOENT;
 	}
 	mutex_unlock(&ebt_mutex);
+	if (XT_ALIGN(match->matchsize) != m->match_size &&
+	    match->matchsize != -1) {
+		/*
+		 * ebt_among is exempt from centralized matchsize checking
+		 * because it uses a dynamic-size data set.
+		 */
+		printk(KERN_WARNING "ebtables: %s match: "
+		       "invalid size %Zu != %u\n",
+		       match->name, XT_ALIGN(match->matchsize), m->match_size);
+		module_put(match->me);
+		return -EINVAL;
+	}
 	if (match->check &&
 	   match->check(name, hookmask, e, m->data, m->match_size) != 0) {
 		BUGPRINT("match->check failed\n");
@@ -380,6 +394,14 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 		return -ENOENT;
 	}
 	mutex_unlock(&ebt_mutex);
+	if (XT_ALIGN(watcher->targetsize) != w->watcher_size) {
+		printk(KERN_WARNING "ebtables: %s watcher: "
+		       "invalid size %Zu != %u\n",
+		       watcher->name, XT_ALIGN(watcher->targetsize),
+		       w->watcher_size);
+		module_put(watcher->me);
+		return -EINVAL;
+	}
 	if (watcher->check &&
 	   watcher->check(name, hookmask, e, w->data, w->watcher_size) != 0) {
 		BUGPRINT("watcher->check failed\n");
@@ -681,9 +703,20 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 			ret = -EFAULT;
 			goto cleanup_watchers;
 		}
-	} else if (t->target_size > gap - sizeof(struct ebt_entry_target) ||
-	   (t->u.target->check &&
-	   t->u.target->check(name, hookmask, e, t->data, t->target_size) != 0)){
+	} else if (t->target_size > gap - sizeof(struct ebt_entry_target)) {
+		module_put(t->u.target->me);
+		ret = -EFAULT;
+		goto cleanup_watchers;
+	} else if (XT_ALIGN(target->targetsize) != t->target_size) {
+		printk(KERN_WARNING "ebtables: %s target: "
+		       "invalid size %Zu != %u\n",
+		       target->name, XT_ALIGN(target->targetsize),
+		       t->target_size);
+		module_put(t->u.target->me);
+		ret = -EINVAL;
+		goto cleanup_watchers;
+	} else if (t->u.target->check &&
+	    t->u.target->check(name, hookmask, e, t->data, t->target_size) != 0) {
 		module_put(t->u.target->me);
 		ret = -EFAULT;
 		goto cleanup_watchers;
-- 
cgit v1.2.3


From 19eda879a136889110c692dec4c2ab59e0e43cef Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:13 +0200
Subject: netfilter: change return types of check functions for Ebtables
 extensions

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c    |  6 +++---
 net/bridge/netfilter/ebt_among.c    | 15 ++++++++-------
 net/bridge/netfilter/ebt_arp.c      |  8 ++++----
 net/bridge/netfilter/ebt_arpreply.c | 10 +++++-----
 net/bridge/netfilter/ebt_dnat.c     | 10 +++++-----
 net/bridge/netfilter/ebt_ip.c       | 16 ++++++++--------
 net/bridge/netfilter/ebt_ip6.c      | 16 ++++++++--------
 net/bridge/netfilter/ebt_limit.c    |  6 +++---
 net/bridge/netfilter/ebt_log.c      |  8 ++++----
 net/bridge/netfilter/ebt_mark.c     | 10 +++++-----
 net/bridge/netfilter/ebt_mark_m.c   | 10 +++++-----
 net/bridge/netfilter/ebt_nflog.c    | 12 ++++++------
 net/bridge/netfilter/ebt_pkttype.c  |  6 +++---
 net/bridge/netfilter/ebt_redirect.c | 10 +++++-----
 net/bridge/netfilter/ebt_snat.c     | 14 +++++++-------
 net/bridge/netfilter/ebt_stp.c      |  8 ++++----
 net/bridge/netfilter/ebt_ulog.c     | 21 ++++++++++++---------
 net/bridge/netfilter/ebt_vlan.c     | 16 ++++++++--------
 net/bridge/netfilter/ebtables.c     |  6 +++---
 19 files changed, 106 insertions(+), 102 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index ccecfbd2a25..868df9c1e42 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -37,15 +37,15 @@ static int ebt_filter_802_3(const struct sk_buff *skb, const struct net_device *
 }
 
 static struct ebt_match filter_802_3;
-static int ebt_802_3_check(const char *tablename, unsigned int hookmask,
+static bool ebt_802_3_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_802_3_info *info = data;
 
 	if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK)
-		return -EINVAL;
+		return false;
 
-	return 0;
+	return true;
 }
 
 static struct ebt_match filter_802_3 __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index b0acb13a390..95e2e70ac90 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -177,9 +177,10 @@ static int ebt_filter_among(const struct sk_buff *skb,
 	return EBT_MATCH;
 }
 
-static int ebt_among_check(const char *tablename, unsigned int hookmask,
-			   const struct ebt_entry *e, void *data,
-			   unsigned int datalen)
+static bool
+ebt_among_check(const char *tablename, unsigned int hookmask,
+		const struct ebt_entry *e, void *data,
+		unsigned int datalen)
 {
 	const struct ebt_among_info *info = data;
 	int expected_length = sizeof(struct ebt_among_info);
@@ -197,19 +198,19 @@ static int ebt_among_check(const char *tablename, unsigned int hookmask,
 		       "against expected %d, rounded to %Zd\n",
 		       datalen, expected_length,
 		       EBT_ALIGN(expected_length));
-		return -EINVAL;
+		return false;
 	}
 	if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) {
 		printk(KERN_WARNING
 		       "ebtables: among: dst integrity fail: %x\n", -err);
-		return -EINVAL;
+		return false;
 	}
 	if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) {
 		printk(KERN_WARNING
 		       "ebtables: among: src integrity fail: %x\n", -err);
-		return -EINVAL;
+		return false;
 	}
-	return 0;
+	return true;
 }
 
 static struct ebt_match filter_among __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 385f9cb85bc..cb33672380d 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -100,7 +100,7 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in
 	return EBT_MATCH;
 }
 
-static int ebt_arp_check(const char *tablename, unsigned int hookmask,
+static bool ebt_arp_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_arp_info *info = data;
@@ -108,10 +108,10 @@ static int ebt_arp_check(const char *tablename, unsigned int hookmask,
 	if ((e->ethproto != htons(ETH_P_ARP) &&
 	   e->ethproto != htons(ETH_P_RARP)) ||
 	   e->invflags & EBT_IPROTO)
-		return -EINVAL;
+		return false;
 	if (info->bitmask & ~EBT_ARP_MASK || info->invflags & ~EBT_ARP_MASK)
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_match filter_arp __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index a860ea6da46..c298d3deffa 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -58,20 +58,20 @@ static int ebt_target_reply(struct sk_buff *skb, unsigned int hooknr,
 	return info->target;
 }
 
-static int ebt_target_reply_check(const char *tablename, unsigned int hookmask,
+static bool ebt_target_reply_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_arpreply_info *info = data;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
-		return -EINVAL;
+		return false;
 	if (e->ethproto != htons(ETH_P_ARP) ||
 	    e->invflags & EBT_IPROTO)
-		return -EINVAL;
+		return false;
 	CLEAR_BASE_CHAIN_BIT;
 	if (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING))
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_target reply_target __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index c2be41e8bb9..6ddea2184e9 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -27,21 +27,21 @@ static int ebt_target_dnat(struct sk_buff *skb, unsigned int hooknr,
 	return info->target;
 }
 
-static int ebt_target_dnat_check(const char *tablename, unsigned int hookmask,
+static bool ebt_target_dnat_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_nat_info *info = data;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
-		return -EINVAL;
+		return false;
 	CLEAR_BASE_CHAIN_BIT;
 	if ( (strcmp(tablename, "nat") ||
 	   (hookmask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) &&
 	   (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
-		return -EINVAL;
+		return false;
 	if (INVALID_TARGET)
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_target dnat __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index c1ae2547e3d..cbf0918ec16 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -78,31 +78,31 @@ static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in,
 	return EBT_MATCH;
 }
 
-static int ebt_ip_check(const char *tablename, unsigned int hookmask,
+static bool ebt_ip_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_ip_info *info = data;
 
 	if (e->ethproto != htons(ETH_P_IP) ||
 	   e->invflags & EBT_IPROTO)
-		return -EINVAL;
+		return false;
 	if (info->bitmask & ~EBT_IP_MASK || info->invflags & ~EBT_IP_MASK)
-		return -EINVAL;
+		return false;
 	if (info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) {
 		if (info->invflags & EBT_IP_PROTO)
-			return -EINVAL;
+			return false;
 		if (info->protocol != IPPROTO_TCP &&
 		    info->protocol != IPPROTO_UDP &&
 		    info->protocol != IPPROTO_UDPLITE &&
 		    info->protocol != IPPROTO_SCTP &&
 		    info->protocol != IPPROTO_DCCP)
-			 return -EINVAL;
+			 return false;
 	}
 	if (info->bitmask & EBT_IP_DPORT && info->dport[0] > info->dport[1])
-		return -EINVAL;
+		return false;
 	if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1])
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_match filter_ip __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 554dd68637c..1230c9ee394 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -92,30 +92,30 @@ static int ebt_filter_ip6(const struct sk_buff *skb,
 	return EBT_MATCH;
 }
 
-static int ebt_ip6_check(const char *tablename, unsigned int hookmask,
+static bool ebt_ip6_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	struct ebt_ip6_info *info = (struct ebt_ip6_info *)data;
 
 	if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
-		return -EINVAL;
+		return false;
 	if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK)
-		return -EINVAL;
+		return false;
 	if (info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) {
 		if (info->invflags & EBT_IP6_PROTO)
-			return -EINVAL;
+			return false;
 		if (info->protocol != IPPROTO_TCP &&
 		    info->protocol != IPPROTO_UDP &&
 		    info->protocol != IPPROTO_UDPLITE &&
 		    info->protocol != IPPROTO_SCTP &&
 		    info->protocol != IPPROTO_DCCP)
-			 return -EINVAL;
+			return false;
 	}
 	if (info->bitmask & EBT_IP6_DPORT && info->dport[0] > info->dport[1])
-		return -EINVAL;
+		return false;
 	if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1])
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_match filter_ip6 =
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 3d71f3510ff..9b04f2be94e 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -65,7 +65,7 @@ user2credits(u_int32_t user)
 	return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE;
 }
 
-static int ebt_limit_check(const char *tablename, unsigned int hookmask,
+static bool ebt_limit_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	struct ebt_limit_info *info = data;
@@ -75,7 +75,7 @@ static int ebt_limit_check(const char *tablename, unsigned int hookmask,
 	    user2credits(info->avg * info->burst) < user2credits(info->avg)) {
 		printk("Overflow in ebt_limit, try lower: %u/%u\n",
 			info->avg, info->burst);
-		return -EINVAL;
+		return false;
 	}
 
 	/* User avg in seconds * EBT_LIMIT_SCALE: convert to jiffies * 128. */
@@ -83,7 +83,7 @@ static int ebt_limit_check(const char *tablename, unsigned int hookmask,
 	info->credit = user2credits(info->avg * info->burst);
 	info->credit_cap = user2credits(info->avg * info->burst);
 	info->cost = user2credits(info->avg);
-	return 0;
+	return true;
 }
 
 static struct ebt_match ebt_limit_reg __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index d9596f114a3..f3d6d5ec2dc 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -24,17 +24,17 @@
 
 static DEFINE_SPINLOCK(ebt_log_lock);
 
-static int ebt_log_check(const char *tablename, unsigned int hookmask,
+static bool ebt_log_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	struct ebt_log_info *info = data;
 
 	if (info->bitmask & ~EBT_LOG_MASK)
-		return -EINVAL;
+		return false;
 	if (info->loglevel >= 8)
-		return -EINVAL;
+		return false;
 	info->prefix[EBT_LOG_PREFIX_SIZE - 1] = '\0';
-	return 0;
+	return true;
 }
 
 struct tcpudphdr
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index bb02412786c..b85c73895ae 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -37,7 +37,7 @@ static int ebt_target_mark(struct sk_buff *skb, unsigned int hooknr,
 	return info->target | ~EBT_VERDICT_BITS;
 }
 
-static int ebt_target_mark_check(const char *tablename, unsigned int hookmask,
+static bool ebt_target_mark_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_mark_t_info *info = data;
@@ -45,15 +45,15 @@ static int ebt_target_mark_check(const char *tablename, unsigned int hookmask,
 
 	tmp = info->target | ~EBT_VERDICT_BITS;
 	if (BASE_CHAIN && tmp == EBT_RETURN)
-		return -EINVAL;
+		return false;
 	CLEAR_BASE_CHAIN_BIT;
 	if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
-		return -EINVAL;
+		return false;
 	tmp = info->target & ~EBT_VERDICT_BITS;
 	if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE &&
 	    tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE)
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_target mark_target __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index b8ce9eb7170..b2707d772c9 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -23,18 +23,18 @@ static int ebt_filter_mark(const struct sk_buff *skb,
 	return !(((skb->mark & info->mask) == info->mark) ^ info->invert);
 }
 
-static int ebt_mark_check(const char *tablename, unsigned int hookmask,
+static bool ebt_mark_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_mark_m_info *info = data;
 
 	if (info->bitmask & ~EBT_MARK_MASK)
-		return -EINVAL;
+		return false;
 	if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND))
-		return -EINVAL;
+		return false;
 	if (!info->bitmask)
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_match filter_mark __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 88ceb5eb849..a6954eb3f58 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -36,17 +36,17 @@ static void ebt_nflog(const struct sk_buff *skb,
 	nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, "%s", info->prefix);
 }
 
-static int ebt_nflog_check(const char *tablename,
-			   unsigned int hookmask,
-			   const struct ebt_entry *e,
-			   void *data, unsigned int datalen)
+static bool ebt_nflog_check(const char *tablename,
+			    unsigned int hookmask,
+			    const struct ebt_entry *e,
+			    void *data, unsigned int datalen)
 {
 	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
 
 	if (info->flags & ~EBT_NFLOG_MASK)
-		return -EINVAL;
+		return false;
 	info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0';
-	return 0;
+	return true;
 }
 
 static struct ebt_watcher nflog __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 019026177f8..4dcd3b86cff 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -23,15 +23,15 @@ static int ebt_filter_pkttype(const struct sk_buff *skb,
 	return (skb->pkt_type != info->pkt_type) ^ info->invert;
 }
 
-static int ebt_pkttype_check(const char *tablename, unsigned int hookmask,
+static bool ebt_pkttype_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_pkttype_info *info = data;
 
 	if (info->invert != 0 && info->invert != 1)
-		return -EINVAL;
+		return false;
 	/* Allow any pkt_type value */
-	return 0;
+	return true;
 }
 
 static struct ebt_match filter_pkttype __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 04053268386..d2076f4227c 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -33,20 +33,20 @@ static int ebt_target_redirect(struct sk_buff *skb, unsigned int hooknr,
 	return info->target;
 }
 
-static int ebt_target_redirect_check(const char *tablename, unsigned int hookmask,
+static bool ebt_target_redirect_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_redirect_info *info = data;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
-		return -EINVAL;
+		return false;
 	CLEAR_BASE_CHAIN_BIT;
 	if ( (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING)) &&
 	     (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
-		return -EINVAL;
+		return false;
 	if (INVALID_TARGET)
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_target redirect_target __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index abfbc6c9502..5a5a16acca0 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -43,7 +43,7 @@ out:
 	return info->target | ~EBT_VERDICT_BITS;
 }
 
-static int ebt_target_snat_check(const char *tablename, unsigned int hookmask,
+static bool ebt_target_snat_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_nat_info *info = data;
@@ -51,19 +51,19 @@ static int ebt_target_snat_check(const char *tablename, unsigned int hookmask,
 
 	tmp = info->target | ~EBT_VERDICT_BITS;
 	if (BASE_CHAIN && tmp == EBT_RETURN)
-		return -EINVAL;
+		return false;
 	CLEAR_BASE_CHAIN_BIT;
 	if (strcmp(tablename, "nat"))
-		return -EINVAL;
+		return false;
 	if (hookmask & ~(1 << NF_BR_POST_ROUTING))
-		return -EINVAL;
+		return false;
 
 	if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
-		return -EINVAL;
+		return false;
 	tmp = info->target | EBT_VERDICT_BITS;
 	if ((tmp & ~NAT_ARP_BIT) != ~NAT_ARP_BIT)
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 static struct ebt_target snat __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index c7a0a00dac7..37d9480a00c 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -153,7 +153,7 @@ static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in
 	return EBT_MATCH;
 }
 
-static int ebt_stp_check(const char *tablename, unsigned int hookmask,
+static bool ebt_stp_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	const struct ebt_stp_info *info = data;
@@ -162,13 +162,13 @@ static int ebt_stp_check(const char *tablename, unsigned int hookmask,
 
 	if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
 	    !(info->bitmask & EBT_STP_MASK))
-		return -EINVAL;
+		return false;
 	/* Make sure the match only receives stp frames */
 	if (compare_ether_addr(e->destmac, bridge_ula) ||
 	    compare_ether_addr(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
-		return -EINVAL;
+		return false;
 
-	return 0;
+	return true;
 }
 
 static struct ebt_match filter_stp __read_mostly = {
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index bdd8a27bba9..e13a005f58a 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -255,14 +255,13 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
 	ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
 }
 
-
-static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
+static bool ebt_ulog_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
 	struct ebt_ulog_info *uloginfo = data;
 
 	if (uloginfo->nlgroup > 31)
-		return -EINVAL;
+		return false;
 
 	uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0';
 
@@ -288,12 +287,13 @@ static const struct nf_logger ebt_ulog_logger = {
 
 static int __init ebt_ulog_init(void)
 {
-	int i, ret = 0;
+	bool ret = true;
+	int i;
 
 	if (nlbufsiz >= 128*1024) {
 		printk(KERN_NOTICE "ebt_ulog: Netlink buffer has to be <= 128kB,"
 		       " please try a smaller nlbufsiz parameter.\n");
-		return -EINVAL;
+		return false;
 	}
 
 	/* initialize ulog_buffers */
@@ -305,12 +305,15 @@ static int __init ebt_ulog_init(void)
 	ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
 					  EBT_ULOG_MAXNLGROUPS, NULL, NULL,
 					  THIS_MODULE);
-	if (!ebtulognl)
-		ret = -ENOMEM;
-	else if ((ret = ebt_register_watcher(&ulog)))
+	if (!ebtulognl) {
+		printk(KERN_WARNING KBUILD_MODNAME ": out of memory trying to "
+		       "call netlink_kernel_create\n");
+		ret = false;
+	} else if (ebt_register_watcher(&ulog) != 0) {
 		netlink_kernel_release(ebtulognl);
+	}
 
-	if (ret == 0)
+	if (ret)
 		nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
 
 	return ret;
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 4dba47aefc8..fc88d5d59e0 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -87,7 +87,7 @@ ebt_filter_vlan(const struct sk_buff *skb,
 	return EBT_MATCH;
 }
 
-static int
+static bool
 ebt_check_vlan(const char *tablename,
 	       unsigned int hooknr,
 	       const struct ebt_entry *e, void *data, unsigned int datalen)
@@ -99,7 +99,7 @@ ebt_check_vlan(const char *tablename,
 		DEBUG_MSG
 		    ("passed entry proto %2.4X is not 802.1Q (8100)\n",
 		     (unsigned short) ntohs(e->ethproto));
-		return -EINVAL;
+		return false;
 	}
 
 	/* Check for bitmask range
@@ -107,14 +107,14 @@ ebt_check_vlan(const char *tablename,
 	if (info->bitmask & ~EBT_VLAN_MASK) {
 		DEBUG_MSG("bitmask %2X is out of mask (%2X)\n",
 			  info->bitmask, EBT_VLAN_MASK);
-		return -EINVAL;
+		return false;
 	}
 
 	/* Check for inversion flags range */
 	if (info->invflags & ~EBT_VLAN_MASK) {
 		DEBUG_MSG("inversion flags %2X is out of mask (%2X)\n",
 			  info->invflags, EBT_VLAN_MASK);
-		return -EINVAL;
+		return false;
 	}
 
 	/* Reserved VLAN ID (VID) values
@@ -129,7 +129,7 @@ ebt_check_vlan(const char *tablename,
 				DEBUG_MSG
 				    ("id %d is out of range (1-4096)\n",
 				     info->id);
-				return -EINVAL;
+				return false;
 			}
 			/* Note: This is valid VLAN-tagged frame point.
 			 * Any value of user_priority are acceptable,
@@ -144,7 +144,7 @@ ebt_check_vlan(const char *tablename,
 		if ((unsigned char) info->prio > 7) {
 			DEBUG_MSG("prio %d is out of range (0-7)\n",
 			     info->prio);
-			return -EINVAL;
+			return false;
 		}
 	}
 	/* Check for encapsulated proto range - it is possible to be
@@ -155,11 +155,11 @@ ebt_check_vlan(const char *tablename,
 			DEBUG_MSG
 			    ("encap frame length %d is less than minimal\n",
 			     ntohs(info->encap));
-			return -EINVAL;
+			return false;
 		}
 	}
 
-	return 0;
+	return true;
 }
 
 static struct ebt_match filter_vlan __read_mostly = {
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index b04e288d20f..fe499527729 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -365,7 +365,7 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		return -EINVAL;
 	}
 	if (match->check &&
-	   match->check(name, hookmask, e, m->data, m->match_size) != 0) {
+	    !match->check(name, hookmask, e, m->data, m->match_size)) {
 		BUGPRINT("match->check failed\n");
 		module_put(match->me);
 		return -EINVAL;
@@ -403,7 +403,7 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 		return -EINVAL;
 	}
 	if (watcher->check &&
-	   watcher->check(name, hookmask, e, w->data, w->watcher_size) != 0) {
+	    !watcher->check(name, hookmask, e, w->data, w->watcher_size)) {
 		BUGPRINT("watcher->check failed\n");
 		module_put(watcher->me);
 		return -EINVAL;
@@ -716,7 +716,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 		ret = -EINVAL;
 		goto cleanup_watchers;
 	} else if (t->u.target->check &&
-	    t->u.target->check(name, hookmask, e, t->data, t->target_size) != 0) {
+	    !t->u.target->check(name, hookmask, e, t->data, t->target_size)) {
 		module_put(t->u.target->me);
 		ret = -EFAULT;
 		goto cleanup_watchers;
-- 
cgit v1.2.3


From 8cc784eec6676b58e7f60419c88179aaa97bf71c Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:13 +0200
Subject: netfilter: change return types of match functions for ebtables
 extensions

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c   | 13 +++++------
 net/bridge/netfilter/ebt_among.c   | 44 +++++++++++++++++---------------------
 net/bridge/netfilter/ebt_arp.c     | 35 +++++++++++++++---------------
 net/bridge/netfilter/ebt_ip.c      | 25 +++++++++++-----------
 net/bridge/netfilter/ebt_ip6.c     | 26 +++++++++++-----------
 net/bridge/netfilter/ebt_limit.c   |  6 +++---
 net/bridge/netfilter/ebt_mark_m.c  |  6 +++---
 net/bridge/netfilter/ebt_pkttype.c |  4 ++--
 net/bridge/netfilter/ebt_stp.c     | 39 +++++++++++++++++----------------
 net/bridge/netfilter/ebt_vlan.c    |  8 +++----
 10 files changed, 103 insertions(+), 103 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 868df9c1e42..8ebe62b9bcc 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -12,7 +12,8 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_802_3.h>
 
-static int ebt_filter_802_3(const struct sk_buff *skb, const struct net_device *in,
+static bool ebt_filter_802_3(const struct sk_buff *skb,
+   const struct net_device *in,
    const struct net_device *out, const void *data, unsigned int datalen)
 {
 	const struct ebt_802_3_info *info = data;
@@ -21,19 +22,19 @@ static int ebt_filter_802_3(const struct sk_buff *skb, const struct net_device *
 
 	if (info->bitmask & EBT_802_3_SAP) {
 		if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP))
-				return EBT_NOMATCH;
+			return false;
 		if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP))
-				return EBT_NOMATCH;
+			return false;
 	}
 
 	if (info->bitmask & EBT_802_3_TYPE) {
 		if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE))
-			return EBT_NOMATCH;
+			return false;
 		if (FWINV(info->type != type, EBT_802_3_TYPE))
-			return EBT_NOMATCH;
+			return false;
 	}
 
-	return EBT_MATCH;
+	return true;
 }
 
 static struct ebt_match filter_802_3;
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 95e2e70ac90..bfdc67bcbfa 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -14,8 +14,8 @@
 #include <linux/if_arp.h>
 #include <linux/module.h>
 
-static int ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
-				     const char *mac, __be32 ip)
+static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
+				      const char *mac, __be32 ip)
 {
 	/* You may be puzzled as to how this code works.
 	 * Some tricks were used, refer to
@@ -33,23 +33,19 @@ static int ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
 	if (ip) {
 		for (i = start; i < limit; i++) {
 			p = &wh->pool[i];
-			if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) {
-				if (p->ip == 0 || p->ip == ip) {
-					return 1;
-				}
-			}
+			if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0])
+				if (p->ip == 0 || p->ip == ip)
+					return true;
 		}
 	} else {
 		for (i = start; i < limit; i++) {
 			p = &wh->pool[i];
-			if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) {
-				if (p->ip == 0) {
-					return 1;
-				}
-			}
+			if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0])
+				if (p->ip == 0)
+					return true;
 		}
 	}
-	return 0;
+	return false;
 }
 
 static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash
@@ -131,10 +127,10 @@ static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
 	return 0;
 }
 
-static int ebt_filter_among(const struct sk_buff *skb,
-			    const struct net_device *in,
-			    const struct net_device *out, const void *data,
-			    unsigned int datalen)
+static bool ebt_filter_among(const struct sk_buff *skb,
+			     const struct net_device *in,
+			     const struct net_device *out, const void *data,
+			     unsigned int datalen)
 {
 	const struct ebt_among_info *info = data;
 	const char *dmac, *smac;
@@ -147,34 +143,34 @@ static int ebt_filter_among(const struct sk_buff *skb,
 	if (wh_src) {
 		smac = eth_hdr(skb)->h_source;
 		if (get_ip_src(skb, &sip))
-			return EBT_NOMATCH;
+			return false;
 		if (!(info->bitmask & EBT_AMONG_SRC_NEG)) {
 			/* we match only if it contains */
 			if (!ebt_mac_wormhash_contains(wh_src, smac, sip))
-				return EBT_NOMATCH;
+				return false;
 		} else {
 			/* we match only if it DOES NOT contain */
 			if (ebt_mac_wormhash_contains(wh_src, smac, sip))
-				return EBT_NOMATCH;
+				return false;
 		}
 	}
 
 	if (wh_dst) {
 		dmac = eth_hdr(skb)->h_dest;
 		if (get_ip_dst(skb, &dip))
-			return EBT_NOMATCH;
+			return false;
 		if (!(info->bitmask & EBT_AMONG_DST_NEG)) {
 			/* we match only if it contains */
 			if (!ebt_mac_wormhash_contains(wh_dst, dmac, dip))
-				return EBT_NOMATCH;
+				return false;
 		} else {
 			/* we match only if it DOES NOT contain */
 			if (ebt_mac_wormhash_contains(wh_dst, dmac, dip))
-				return EBT_NOMATCH;
+				return false;
 		}
 	}
 
-	return EBT_MATCH;
+	return true;
 }
 
 static bool
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index cb33672380d..f1f0bcf5524 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -15,7 +15,8 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_arp.h>
 
-static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in,
+static bool ebt_filter_arp(const struct sk_buff *skb,
+   const struct net_device *in,
    const struct net_device *out, const void *data, unsigned int datalen)
 {
 	const struct ebt_arp_info *info = data;
@@ -24,42 +25,42 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in
 
 	ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
 	if (ah == NULL)
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode !=
 	   ah->ar_op, EBT_ARP_OPCODE))
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype !=
 	   ah->ar_hrd, EBT_ARP_HTYPE))
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype !=
 	   ah->ar_pro, EBT_ARP_PTYPE))
-		return EBT_NOMATCH;
+		return false;
 
 	if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) {
 		const __be32 *sap, *dap;
 		__be32 saddr, daddr;
 
 		if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP))
-			return EBT_NOMATCH;
+			return false;
 		sap = skb_header_pointer(skb, sizeof(struct arphdr) +
 					ah->ar_hln, sizeof(saddr),
 					&saddr);
 		if (sap == NULL)
-			return EBT_NOMATCH;
+			return false;
 		dap = skb_header_pointer(skb, sizeof(struct arphdr) +
 					2*ah->ar_hln+sizeof(saddr),
 					sizeof(daddr), &daddr);
 		if (dap == NULL)
-			return EBT_NOMATCH;
+			return false;
 		if (info->bitmask & EBT_ARP_SRC_IP &&
 		    FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP))
-			return EBT_NOMATCH;
+			return false;
 		if (info->bitmask & EBT_ARP_DST_IP &&
 		    FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP))
-			return EBT_NOMATCH;
+			return false;
 		if (info->bitmask & EBT_ARP_GRAT &&
 		    FWINV(*dap != *sap, EBT_ARP_GRAT))
-			return EBT_NOMATCH;
+			return false;
 	}
 
 	if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) {
@@ -68,18 +69,18 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in
 		uint8_t verdict, i;
 
 		if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER))
-			return EBT_NOMATCH;
+			return false;
 		if (info->bitmask & EBT_ARP_SRC_MAC) {
 			mp = skb_header_pointer(skb, sizeof(struct arphdr),
 						sizeof(_mac), &_mac);
 			if (mp == NULL)
-				return EBT_NOMATCH;
+				return false;
 			verdict = 0;
 			for (i = 0; i < 6; i++)
 				verdict |= (mp[i] ^ info->smaddr[i]) &
 				       info->smmsk[i];
 			if (FWINV(verdict != 0, EBT_ARP_SRC_MAC))
-				return EBT_NOMATCH;
+				return false;
 		}
 
 		if (info->bitmask & EBT_ARP_DST_MAC) {
@@ -87,17 +88,17 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in
 						ah->ar_hln + ah->ar_pln,
 						sizeof(_mac), &_mac);
 			if (mp == NULL)
-				return EBT_NOMATCH;
+				return false;
 			verdict = 0;
 			for (i = 0; i < 6; i++)
 				verdict |= (mp[i] ^ info->dmaddr[i]) &
 					info->dmmsk[i];
 			if (FWINV(verdict != 0, EBT_ARP_DST_MAC))
-				return EBT_NOMATCH;
+				return false;
 		}
 	}
 
-	return EBT_MATCH;
+	return true;
 }
 
 static bool ebt_arp_check(const char *tablename, unsigned int hookmask,
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index cbf0918ec16..018782f044c 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -24,7 +24,8 @@ struct tcpudphdr {
 	__be16 dst;
 };
 
-static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in,
+static bool ebt_filter_ip(const struct sk_buff *skb,
+   const struct net_device *in,
    const struct net_device *out, const void *data,
    unsigned int datalen)
 {
@@ -36,46 +37,46 @@ static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in,
 
 	ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_IP_TOS &&
 	   FWINV(info->tos != ih->tos, EBT_IP_TOS))
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_IP_SOURCE &&
 	   FWINV((ih->saddr & info->smsk) !=
 	   info->saddr, EBT_IP_SOURCE))
-		return EBT_NOMATCH;
+		return false;
 	if ((info->bitmask & EBT_IP_DEST) &&
 	   FWINV((ih->daddr & info->dmsk) !=
 	   info->daddr, EBT_IP_DEST))
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_IP_PROTO) {
 		if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO))
-			return EBT_NOMATCH;
+			return false;
 		if (!(info->bitmask & EBT_IP_DPORT) &&
 		    !(info->bitmask & EBT_IP_SPORT))
-			return EBT_MATCH;
+			return true;
 		if (ntohs(ih->frag_off) & IP_OFFSET)
-			return EBT_NOMATCH;
+			return false;
 		pptr = skb_header_pointer(skb, ih->ihl*4,
 					  sizeof(_ports), &_ports);
 		if (pptr == NULL)
-			return EBT_NOMATCH;
+			return false;
 		if (info->bitmask & EBT_IP_DPORT) {
 			u32 dst = ntohs(pptr->dst);
 			if (FWINV(dst < info->dport[0] ||
 				  dst > info->dport[1],
 				  EBT_IP_DPORT))
-			return EBT_NOMATCH;
+			return false;
 		}
 		if (info->bitmask & EBT_IP_SPORT) {
 			u32 src = ntohs(pptr->src);
 			if (FWINV(src < info->sport[0] ||
 				  src > info->sport[1],
 				  EBT_IP_SPORT))
-			return EBT_NOMATCH;
+			return false;
 		}
 	}
-	return EBT_MATCH;
+	return true;
 }
 
 static bool ebt_ip_check(const char *tablename, unsigned int hookmask,
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 1230c9ee394..7fc3928e3fb 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -27,7 +27,7 @@ struct tcpudphdr {
 	__be16 dst;
 };
 
-static int ebt_filter_ip6(const struct sk_buff *skb,
+static bool ebt_filter_ip6(const struct sk_buff *skb,
    const struct net_device *in,
    const struct net_device *out, const void *data,
    unsigned int datalen)
@@ -42,54 +42,54 @@ static int ebt_filter_ip6(const struct sk_buff *skb,
 
 	ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
 	if (ih6 == NULL)
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_IP6_TCLASS &&
 	   FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS))
-		return EBT_NOMATCH;
+		return false;
 	for (i = 0; i < 4; i++)
 		tmp_addr.in6_u.u6_addr32[i] = ih6->saddr.in6_u.u6_addr32[i] &
 			info->smsk.in6_u.u6_addr32[i];
 	if (info->bitmask & EBT_IP6_SOURCE &&
 		FWINV((ipv6_addr_cmp(&tmp_addr, &info->saddr) != 0),
 			EBT_IP6_SOURCE))
-		return EBT_NOMATCH;
+		return false;
 	for (i = 0; i < 4; i++)
 		tmp_addr.in6_u.u6_addr32[i] = ih6->daddr.in6_u.u6_addr32[i] &
 			info->dmsk.in6_u.u6_addr32[i];
 	if (info->bitmask & EBT_IP6_DEST &&
 	   FWINV((ipv6_addr_cmp(&tmp_addr, &info->daddr) != 0), EBT_IP6_DEST))
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_IP6_PROTO) {
 		uint8_t nexthdr = ih6->nexthdr;
 		int offset_ph;
 
 		offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr);
 		if (offset_ph == -1)
-			return EBT_NOMATCH;
+			return false;
 		if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
-			return EBT_NOMATCH;
+			return false;
 		if (!(info->bitmask & EBT_IP6_DPORT) &&
 		    !(info->bitmask & EBT_IP6_SPORT))
-			return EBT_MATCH;
+			return true;
 		pptr = skb_header_pointer(skb, offset_ph, sizeof(_ports),
 					  &_ports);
 		if (pptr == NULL)
-			return EBT_NOMATCH;
+			return false;
 		if (info->bitmask & EBT_IP6_DPORT) {
 			u32 dst = ntohs(pptr->dst);
 			if (FWINV(dst < info->dport[0] ||
 				  dst > info->dport[1], EBT_IP6_DPORT))
-				return EBT_NOMATCH;
+				return false;
 		}
 		if (info->bitmask & EBT_IP6_SPORT) {
 			u32 src = ntohs(pptr->src);
 			if (FWINV(src < info->sport[0] ||
 				  src > info->sport[1], EBT_IP6_SPORT))
-			return EBT_NOMATCH;
+			return false;
 		}
-		return EBT_MATCH;
+		return true;
 	}
-	return EBT_MATCH;
+	return true;
 }
 
 static bool ebt_ip6_check(const char *tablename, unsigned int hookmask,
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 9b04f2be94e..925065a22a6 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -30,7 +30,7 @@ static DEFINE_SPINLOCK(limit_lock);
 
 #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
 
-static int ebt_limit_match(const struct sk_buff *skb,
+static bool ebt_limit_match(const struct sk_buff *skb,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
@@ -46,11 +46,11 @@ static int ebt_limit_match(const struct sk_buff *skb,
 		/* We're not limited. */
 		info->credit -= info->cost;
 		spin_unlock_bh(&limit_lock);
-		return EBT_MATCH;
+		return true;
 	}
 
 	spin_unlock_bh(&limit_lock);
-	return EBT_NOMATCH;
+	return false;
 }
 
 /* Precision saver. */
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index b2707d772c9..ec16c0e2868 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -12,15 +12,15 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_mark_m.h>
 
-static int ebt_filter_mark(const struct sk_buff *skb,
+static bool ebt_filter_mark(const struct sk_buff *skb,
    const struct net_device *in, const struct net_device *out, const void *data,
    unsigned int datalen)
 {
 	const struct ebt_mark_m_info *info = data;
 
 	if (info->bitmask & EBT_MARK_OR)
-		return !(!!(skb->mark & info->mask) ^ info->invert);
-	return !(((skb->mark & info->mask) == info->mark) ^ info->invert);
+		return !!(skb->mark & info->mask) ^ info->invert;
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
 static bool ebt_mark_check(const char *tablename, unsigned int hookmask,
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 4dcd3b86cff..74b44328436 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -12,7 +12,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_pkttype.h>
 
-static int ebt_filter_pkttype(const struct sk_buff *skb,
+static bool ebt_filter_pkttype(const struct sk_buff *skb,
    const struct net_device *in,
    const struct net_device *out,
    const void *data,
@@ -20,7 +20,7 @@ static int ebt_filter_pkttype(const struct sk_buff *skb,
 {
 	const struct ebt_pkttype_info *info = data;
 
-	return (skb->pkt_type != info->pkt_type) ^ info->invert;
+	return (skb->pkt_type == info->pkt_type) ^ info->invert;
 }
 
 static bool ebt_pkttype_check(const char *tablename, unsigned int hookmask,
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 37d9480a00c..7618206639e 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -40,7 +40,7 @@ struct stp_config_pdu {
 #define NR16(p) (p[0] << 8 | p[1])
 #define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3])
 
-static int ebt_filter_config(const struct ebt_stp_info *info,
+static bool ebt_filter_config(const struct ebt_stp_info *info,
    const struct stp_config_pdu *stpc)
 {
 	const struct ebt_stp_config_info *c;
@@ -51,12 +51,12 @@ static int ebt_filter_config(const struct ebt_stp_info *info,
 	c = &info->config;
 	if ((info->bitmask & EBT_STP_FLAGS) &&
 	    FWINV(c->flags != stpc->flags, EBT_STP_FLAGS))
-		return EBT_NOMATCH;
+		return false;
 	if (info->bitmask & EBT_STP_ROOTPRIO) {
 		v16 = NR16(stpc->root);
 		if (FWINV(v16 < c->root_priol ||
 		    v16 > c->root_priou, EBT_STP_ROOTPRIO))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_ROOTADDR) {
 		verdict = 0;
@@ -64,19 +64,19 @@ static int ebt_filter_config(const struct ebt_stp_info *info,
 			verdict |= (stpc->root[2+i] ^ c->root_addr[i]) &
 				   c->root_addrmsk[i];
 		if (FWINV(verdict != 0, EBT_STP_ROOTADDR))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_ROOTCOST) {
 		v32 = NR32(stpc->root_cost);
 		if (FWINV(v32 < c->root_costl ||
 		    v32 > c->root_costu, EBT_STP_ROOTCOST))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_SENDERPRIO) {
 		v16 = NR16(stpc->sender);
 		if (FWINV(v16 < c->sender_priol ||
 		    v16 > c->sender_priou, EBT_STP_SENDERPRIO))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_SENDERADDR) {
 		verdict = 0;
@@ -84,42 +84,43 @@ static int ebt_filter_config(const struct ebt_stp_info *info,
 			verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) &
 				   c->sender_addrmsk[i];
 		if (FWINV(verdict != 0, EBT_STP_SENDERADDR))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_PORT) {
 		v16 = NR16(stpc->port);
 		if (FWINV(v16 < c->portl ||
 		    v16 > c->portu, EBT_STP_PORT))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_MSGAGE) {
 		v16 = NR16(stpc->msg_age);
 		if (FWINV(v16 < c->msg_agel ||
 		    v16 > c->msg_ageu, EBT_STP_MSGAGE))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_MAXAGE) {
 		v16 = NR16(stpc->max_age);
 		if (FWINV(v16 < c->max_agel ||
 		    v16 > c->max_ageu, EBT_STP_MAXAGE))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_HELLOTIME) {
 		v16 = NR16(stpc->hello_time);
 		if (FWINV(v16 < c->hello_timel ||
 		    v16 > c->hello_timeu, EBT_STP_HELLOTIME))
-			return EBT_NOMATCH;
+			return false;
 	}
 	if (info->bitmask & EBT_STP_FWDD) {
 		v16 = NR16(stpc->forward_delay);
 		if (FWINV(v16 < c->forward_delayl ||
 		    v16 > c->forward_delayu, EBT_STP_FWDD))
-			return EBT_NOMATCH;
+			return false;
 	}
-	return EBT_MATCH;
+	return true;
 }
 
-static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in,
+static bool ebt_filter_stp(const struct sk_buff *skb,
+   const struct net_device *in,
    const struct net_device *out, const void *data, unsigned int datalen)
 {
 	const struct ebt_stp_info *info = data;
@@ -129,15 +130,15 @@ static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in
 
 	sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph);
 	if (sp == NULL)
-		return EBT_NOMATCH;
+		return false;
 
 	/* The stp code only considers these */
 	if (memcmp(sp, header, sizeof(header)))
-		return EBT_NOMATCH;
+		return false;
 
 	if (info->bitmask & EBT_STP_TYPE
 	    && FWINV(info->type != sp->type, EBT_STP_TYPE))
-		return EBT_NOMATCH;
+		return false;
 
 	if (sp->type == BPDU_TYPE_CONFIG &&
 	    info->bitmask & EBT_STP_CONFIG_MASK) {
@@ -147,10 +148,10 @@ static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in
 		st = skb_header_pointer(skb, sizeof(_stph),
 					sizeof(_stpc), &_stpc);
 		if (st == NULL)
-			return EBT_NOMATCH;
+			return false;
 		return ebt_filter_config(info, st);
 	}
-	return EBT_MATCH;
+	return true;
 }
 
 static bool ebt_stp_check(const char *tablename, unsigned int hookmask,
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index fc88d5d59e0..8cc4257a1ad 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -38,9 +38,9 @@ MODULE_LICENSE("GPL");
 
 #define DEBUG_MSG(args...) if (debug) printk (KERN_DEBUG "ebt_vlan: " args)
 #define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_
-#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return EBT_NOMATCH;}
+#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; }
 
-static int
+static bool
 ebt_filter_vlan(const struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -58,7 +58,7 @@ ebt_filter_vlan(const struct sk_buff *skb,
 
 	fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
 	if (fp == NULL)
-		return EBT_NOMATCH;
+		return false;
 
 	/* Tag Control Information (TCI) consists of the following elements:
 	 * - User_priority. The user_priority field is three bits in length,
@@ -84,7 +84,7 @@ ebt_filter_vlan(const struct sk_buff *skb,
 	if (GET_BITMASK(EBT_VLAN_ENCAP))
 		EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP);
 
-	return EBT_MATCH;
+	return true;
 }
 
 static bool
-- 
cgit v1.2.3


From 0ac6ab1f7915fc820ca0cf8f597290dbb249edcc Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:13 +0200
Subject: netfilter: Change return types of targets/watchers for Ebtables
 extensions

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_arpreply.c |  2 +-
 net/bridge/netfilter/ebt_dnat.c     |  2 +-
 net/bridge/netfilter/ebt_log.c      |  3 ++-
 net/bridge/netfilter/ebt_mark.c     |  2 +-
 net/bridge/netfilter/ebt_nflog.c    | 11 ++++++-----
 net/bridge/netfilter/ebt_redirect.c |  3 ++-
 net/bridge/netfilter/ebt_snat.c     |  2 +-
 net/bridge/netfilter/ebt_ulog.c     |  3 ++-
 8 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index c298d3deffa..b444cf835f1 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -15,7 +15,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_arpreply.h>
 
-static int ebt_target_reply(struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_target_reply(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index 6ddea2184e9..d58b9e32338 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -14,7 +14,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_nat.h>
 
-static int ebt_target_dnat(struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_target_dnat(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index f3d6d5ec2dc..2705d7a2a9b 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -192,7 +192,7 @@ out:
 
 }
 
-static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_log(const struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
@@ -209,6 +209,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	else
 		ebt_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li,
 			       info->prefix);
+	return EBT_CONTINUE;
 }
 
 static struct ebt_watcher log =
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index b85c73895ae..e4b91d8e2c6 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -18,7 +18,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_mark_t.h>
 
-static int ebt_target_mark(struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_target_mark(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index a6954eb3f58..2c75023b326 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -19,11 +19,11 @@
 #include <linux/netfilter_bridge/ebt_nflog.h>
 #include <net/netfilter/nf_log.h>
 
-static void ebt_nflog(const struct sk_buff *skb,
-		      unsigned int hooknr,
-		      const struct net_device *in,
-		      const struct net_device *out,
-		      const void *data, unsigned int datalen)
+static unsigned int ebt_nflog(const struct sk_buff *skb,
+			      unsigned int hooknr,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      const void *data, unsigned int datalen)
 {
 	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
 	struct nf_loginfo li;
@@ -34,6 +34,7 @@ static void ebt_nflog(const struct sk_buff *skb,
 	li.u.ulog.qthreshold = info->threshold;
 
 	nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, "%s", info->prefix);
+	return EBT_CONTINUE;
 }
 
 static bool ebt_nflog_check(const char *tablename,
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index d2076f4227c..7bf1390ad97 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -15,7 +15,8 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_redirect.h>
 
-static int ebt_target_redirect(struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_target_redirect(struct sk_buff *skb,
+   unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 5a5a16acca0..d13f05d2620 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -16,7 +16,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_nat.h>
 
-static int ebt_target_snat(struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_target_snat(struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index e13a005f58a..5f86f555f6d 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -246,13 +246,14 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum,
 	ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
 }
 
-static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+static unsigned int ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
    const struct net_device *in, const struct net_device *out,
    const void *data, unsigned int datalen)
 {
 	const struct ebt_ulog_info *uloginfo = data;
 
 	ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
+	return EBT_CONTINUE;
 }
 
 static bool ebt_ulog_check(const char *tablename, unsigned int hookmask,
-- 
cgit v1.2.3


From 001a18d369f4813ed792629ff4a9a6ade2a4a031 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:14 +0200
Subject: netfilter: add dummy members to Ebtables code to ease transition to
 Xtables

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c    |  2 ++
 net/bridge/netfilter/ebt_among.c    |  2 ++
 net/bridge/netfilter/ebt_arp.c      |  2 ++
 net/bridge/netfilter/ebt_arpreply.c |  2 ++
 net/bridge/netfilter/ebt_dnat.c     |  2 ++
 net/bridge/netfilter/ebt_ip.c       |  2 ++
 net/bridge/netfilter/ebt_ip6.c      |  2 ++
 net/bridge/netfilter/ebt_limit.c    |  2 ++
 net/bridge/netfilter/ebt_log.c      |  2 ++
 net/bridge/netfilter/ebt_mark.c     |  2 ++
 net/bridge/netfilter/ebt_mark_m.c   |  2 ++
 net/bridge/netfilter/ebt_nflog.c    |  2 ++
 net/bridge/netfilter/ebt_pkttype.c  |  2 ++
 net/bridge/netfilter/ebt_redirect.c |  2 ++
 net/bridge/netfilter/ebt_snat.c     |  2 ++
 net/bridge/netfilter/ebt_stp.c      |  2 ++
 net/bridge/netfilter/ebt_ulog.c     |  2 ++
 net/bridge/netfilter/ebt_vlan.c     |  2 ++
 net/bridge/netfilter/ebtables.c     | 58 +++++++++++++++++++++++++++++++------
 19 files changed, 85 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 8ebe62b9bcc..f9876f22757 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -51,6 +51,8 @@ static bool ebt_802_3_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_802_3 __read_mostly = {
 	.name		= EBT_802_3_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_802_3,
 	.check		= ebt_802_3_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_802_3_info)),
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index bfdc67bcbfa..568c890887b 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -211,6 +211,8 @@ ebt_among_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_among __read_mostly = {
 	.name		= EBT_AMONG_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_among,
 	.check		= ebt_among_check,
 	.matchsize	= -1, /* special case */
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index f1f0bcf5524..4a5226cbab8 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -117,6 +117,8 @@ static bool ebt_arp_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_arp __read_mostly = {
 	.name		= EBT_ARP_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_arp,
 	.check		= ebt_arp_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_arp_info)),
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index b444cf835f1..7ab16556800 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -76,6 +76,8 @@ static bool ebt_target_reply_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_target reply_target __read_mostly = {
 	.name		= EBT_ARPREPLY_TARGET,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_target_reply,
 	.check		= ebt_target_reply_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_arpreply_info)),
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index d58b9e32338..64838e2835a 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -46,6 +46,8 @@ static bool ebt_target_dnat_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_target dnat __read_mostly = {
 	.name		= EBT_DNAT_TARGET,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_target_dnat,
 	.check		= ebt_target_dnat_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index 018782f044c..0bef6f7bc83 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -108,6 +108,8 @@ static bool ebt_ip_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_ip __read_mostly = {
 	.name		= EBT_IP_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_ip,
 	.check		= ebt_ip_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_ip_info)),
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 7fc3928e3fb..afcabe205b8 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -121,6 +121,8 @@ static bool ebt_ip6_check(const char *tablename, unsigned int hookmask,
 static struct ebt_match filter_ip6 =
 {
 	.name		= EBT_IP6_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_ip6,
 	.check		= ebt_ip6_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_ip6_info)),
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 925065a22a6..9ca0a2564c8 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -88,6 +88,8 @@ static bool ebt_limit_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match ebt_limit_reg __read_mostly = {
 	.name		= EBT_LIMIT_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_limit_match,
 	.check		= ebt_limit_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_limit_info)),
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 2705d7a2a9b..c2e1c357025 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -215,6 +215,8 @@ static unsigned int ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 static struct ebt_watcher log =
 {
 	.name		= EBT_LOG_WATCHER,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.watcher	= ebt_log,
 	.check		= ebt_log_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_log_info)),
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index e4b91d8e2c6..910721a1267 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -58,6 +58,8 @@ static bool ebt_target_mark_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_target mark_target __read_mostly = {
 	.name		= EBT_MARK_TARGET,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_target_mark,
 	.check		= ebt_target_mark_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_mark_t_info)),
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index ec16c0e2868..6512ad9b409 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -39,6 +39,8 @@ static bool ebt_mark_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_mark __read_mostly = {
 	.name		= EBT_MARK_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_mark,
 	.check		= ebt_mark_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_mark_m_info)),
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 2c75023b326..aa0410c69a6 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -52,6 +52,8 @@ static bool ebt_nflog_check(const char *tablename,
 
 static struct ebt_watcher nflog __read_mostly = {
 	.name = EBT_NFLOG_WATCHER,
+	.revision = 0,
+	.family = NFPROTO_BRIDGE,
 	.watcher = ebt_nflog,
 	.check = ebt_nflog_check,
 	.targetsize = XT_ALIGN(sizeof(struct ebt_nflog_info)),
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 74b44328436..a9acecc88e9 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -36,6 +36,8 @@ static bool ebt_pkttype_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_pkttype __read_mostly = {
 	.name		= EBT_PKTTYPE_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_pkttype,
 	.check		= ebt_pkttype_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_pkttype_info)),
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 7bf1390ad97..4c628108bcd 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -52,6 +52,8 @@ static bool ebt_target_redirect_check(const char *tablename, unsigned int hookma
 
 static struct ebt_target redirect_target __read_mostly = {
 	.name		= EBT_REDIRECT_TARGET,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_target_redirect,
 	.check		= ebt_target_redirect_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_redirect_info)),
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index d13f05d2620..0e83de781c0 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -68,6 +68,8 @@ static bool ebt_target_snat_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_target snat __read_mostly = {
 	.name		= EBT_SNAT_TARGET,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_target_snat,
 	.check		= ebt_target_snat_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 7618206639e..e6d8f0c140a 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -174,6 +174,8 @@ static bool ebt_stp_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_match filter_stp __read_mostly = {
 	.name		= EBT_STP_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_stp,
 	.check		= ebt_stp_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_stp_info)),
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 5f86f555f6d..076b44590f1 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -274,6 +274,8 @@ static bool ebt_ulog_check(const char *tablename, unsigned int hookmask,
 
 static struct ebt_watcher ulog __read_mostly = {
 	.name		= EBT_ULOG_WATCHER,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.watcher	= ebt_ulog,
 	.check		= ebt_ulog_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_ulog_info)),
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 8cc4257a1ad..9e3a39ae466 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -164,6 +164,8 @@ ebt_check_vlan(const char *tablename,
 
 static struct ebt_match filter_vlan __read_mostly = {
 	.name		= EBT_VLAN_MATCH,
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_filter_vlan,
 	.check		= ebt_check_vlan,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_vlan_info)),
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index fe499527729..bc4b3f4f37c 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -61,7 +61,9 @@ static LIST_HEAD(ebt_matches);
 static LIST_HEAD(ebt_watchers);
 
 static struct ebt_target ebt_standard_target = {
-	.name = "standard",
+	.name       = "standard",
+	.revision   = 0,
+	.family     = NFPROTO_BRIDGE,
 };
 
 static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
@@ -352,6 +354,17 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		return -ENOENT;
 	}
 	mutex_unlock(&ebt_mutex);
+	if (match->family != NFPROTO_BRIDGE) {
+		printk(KERN_WARNING "ebtables: %s match: not for ebtables?\n",
+		       match->name);
+		goto out;
+	}
+	if (match->revision != 0) {
+		printk(KERN_WARNING "ebtables: %s match: ebtables is not "
+		       "supporting revisions at this time\n",
+		       match->name);
+		goto out;
+	}
 	if (XT_ALIGN(match->matchsize) != m->match_size &&
 	    match->matchsize != -1) {
 		/*
@@ -361,17 +374,18 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		printk(KERN_WARNING "ebtables: %s match: "
 		       "invalid size %Zu != %u\n",
 		       match->name, XT_ALIGN(match->matchsize), m->match_size);
-		module_put(match->me);
-		return -EINVAL;
+		goto out;
 	}
 	if (match->check &&
 	    !match->check(name, hookmask, e, m->data, m->match_size)) {
 		BUGPRINT("match->check failed\n");
-		module_put(match->me);
-		return -EINVAL;
+		goto out;
 	}
 	(*cnt)++;
 	return 0;
+ out:
+	module_put(match->me);
+	return -EINVAL;
 }
 
 static inline int
@@ -394,22 +408,34 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 		return -ENOENT;
 	}
 	mutex_unlock(&ebt_mutex);
+	if (watcher->family != NFPROTO_BRIDGE) {
+		printk(KERN_WARNING "ebtables: %s watcher: not for ebtables?\n",
+		       watcher->name);
+		goto out;
+	}
+	if (watcher->revision != 0) {
+		printk(KERN_WARNING "ebtables: %s watcher: ebtables is not "
+		       "supporting revisions at this time\n",
+		       watcher->name);
+		goto out;
+	}
 	if (XT_ALIGN(watcher->targetsize) != w->watcher_size) {
 		printk(KERN_WARNING "ebtables: %s watcher: "
 		       "invalid size %Zu != %u\n",
 		       watcher->name, XT_ALIGN(watcher->targetsize),
 		       w->watcher_size);
-		module_put(watcher->me);
-		return -EINVAL;
+		goto out;
 	}
 	if (watcher->check &&
 	    !watcher->check(name, hookmask, e, w->data, w->watcher_size)) {
 		BUGPRINT("watcher->check failed\n");
-		module_put(watcher->me);
-		return -EINVAL;
+		goto out;
 	}
 	(*cnt)++;
 	return 0;
+ out:
+	module_put(watcher->me);
+	return -EINVAL;
 }
 
 static int ebt_verify_pointers(struct ebt_replace *repl,
@@ -690,6 +716,20 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	}
 	mutex_unlock(&ebt_mutex);
 
+	if (target->family != NFPROTO_BRIDGE) {
+		printk(KERN_WARNING "ebtables: %s target: not for ebtables?\n",
+		       target->name);
+		ret = -EINVAL;
+		goto cleanup_watchers;
+	}
+	if (target->revision != 0) {
+		printk(KERN_WARNING "ebtables: %s target: ebtables is not "
+		       "supporting revisions at this time\n",
+		       target->name);
+		ret = -EINVAL;
+		goto cleanup_watchers;
+	}
+
 	t->u.target = target;
 	if (t->u.target == &ebt_standard_target) {
 		if (gap < sizeof(struct ebt_standard_target)) {
-- 
cgit v1.2.3


From 815377fe344c799228ca6278613ca3100b069ad5 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:14 +0200
Subject: netfilter: ebt_among: obtain match size through different means

The function signatures will be changed to match those of Xtables, and
the datalen argument will be gone. ebt_among unfortunately relies on
it, so we need to obtain it somehow.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_among.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 568c890887b..88b5c9118a7 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -178,6 +178,8 @@ ebt_among_check(const char *tablename, unsigned int hookmask,
 		const struct ebt_entry *e, void *data,
 		unsigned int datalen)
 {
+	const struct ebt_entry_match *em =
+		container_of(data, const struct ebt_entry_match, data);
 	const struct ebt_among_info *info = data;
 	int expected_length = sizeof(struct ebt_among_info);
 	const struct ebt_mac_wormhash *wh_dst, *wh_src;
@@ -188,11 +190,11 @@ ebt_among_check(const char *tablename, unsigned int hookmask,
 	expected_length += ebt_mac_wormhash_size(wh_dst);
 	expected_length += ebt_mac_wormhash_size(wh_src);
 
-	if (datalen != EBT_ALIGN(expected_length)) {
+	if (em->match_size != EBT_ALIGN(expected_length)) {
 		printk(KERN_WARNING
 		       "ebtables: among: wrong size: %d "
 		       "against expected %d, rounded to %Zd\n",
-		       datalen, expected_length,
+		       em->match_size, expected_length,
 		       EBT_ALIGN(expected_length));
 		return false;
 	}
-- 
cgit v1.2.3


From 2d06d4a5cc107046508d860a0b47dbc43b829b79 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:15 +0200
Subject: netfilter: change Ebtables function signatures to match Xtables's

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c    | 18 ++++++++++--------
 net/bridge/netfilter/ebt_among.c    | 18 +++++++++---------
 net/bridge/netfilter/ebt_arp.c      | 18 +++++++++++-------
 net/bridge/netfilter/ebt_arpreply.c | 18 +++++++++++-------
 net/bridge/netfilter/ebt_dnat.c     | 17 ++++++++++-------
 net/bridge/netfilter/ebt_ip.c       | 19 +++++++++++--------
 net/bridge/netfilter/ebt_ip6.c      | 19 +++++++++++--------
 net/bridge/netfilter/ebt_limit.c    | 17 ++++++++++-------
 net/bridge/netfilter/ebt_log.c      | 17 ++++++++++-------
 net/bridge/netfilter/ebt_mark.c     | 17 ++++++++++-------
 net/bridge/netfilter/ebt_mark_m.c   | 17 ++++++++++-------
 net/bridge/netfilter/ebt_nflog.c    | 21 ++++++++++-----------
 net/bridge/netfilter/ebt_pkttype.c  | 20 +++++++++++---------
 net/bridge/netfilter/ebt_redirect.c | 18 ++++++++++--------
 net/bridge/netfilter/ebt_snat.c     | 17 ++++++++++-------
 net/bridge/netfilter/ebt_stp.c      | 18 +++++++++++-------
 net/bridge/netfilter/ebt_ulog.c     | 17 ++++++++++-------
 net/bridge/netfilter/ebt_vlan.c     | 18 +++++++++---------
 net/bridge/netfilter/ebtables.c     | 30 ++++++++++++++----------------
 19 files changed, 198 insertions(+), 156 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index f9876f22757..6f1a69c28ed 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -12,9 +12,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_802_3.h>
 
-static bool ebt_filter_802_3(const struct sk_buff *skb,
-   const struct net_device *in,
-   const struct net_device *out, const void *data, unsigned int datalen)
+static bool
+ebt_802_3_mt(const struct sk_buff *skb, const struct net_device *in,
+	     const struct net_device *out, const struct xt_match *match,
+	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_802_3_info *info = data;
 	const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb);
@@ -37,9 +38,10 @@ static bool ebt_filter_802_3(const struct sk_buff *skb,
 	return true;
 }
 
-static struct ebt_match filter_802_3;
-static bool ebt_802_3_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_802_3_mt_check(const char *table, const void *entry,
+		   const struct xt_match *match, void *data,
+		   unsigned int hook_mask)
 {
 	const struct ebt_802_3_info *info = data;
 
@@ -53,8 +55,8 @@ static struct ebt_match filter_802_3 __read_mostly = {
 	.name		= EBT_802_3_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_802_3,
-	.check		= ebt_802_3_check,
+	.match		= ebt_802_3_mt,
+	.checkentry	= ebt_802_3_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_802_3_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 88b5c9118a7..84a306f085b 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -127,10 +127,10 @@ static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
 	return 0;
 }
 
-static bool ebt_filter_among(const struct sk_buff *skb,
-			     const struct net_device *in,
-			     const struct net_device *out, const void *data,
-			     unsigned int datalen)
+static bool
+ebt_among_mt(const struct sk_buff *skb, const struct net_device *in,
+	     const struct net_device *out, const struct xt_match *match,
+	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_among_info *info = data;
 	const char *dmac, *smac;
@@ -174,9 +174,9 @@ static bool ebt_filter_among(const struct sk_buff *skb,
 }
 
 static bool
-ebt_among_check(const char *tablename, unsigned int hookmask,
-		const struct ebt_entry *e, void *data,
-		unsigned int datalen)
+ebt_among_mt_check(const char *table, const void *entry,
+		   const struct xt_match *match, void *data,
+		   unsigned int hook_mask)
 {
 	const struct ebt_entry_match *em =
 		container_of(data, const struct ebt_entry_match, data);
@@ -215,8 +215,8 @@ static struct ebt_match filter_among __read_mostly = {
 	.name		= EBT_AMONG_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_among,
-	.check		= ebt_among_check,
+	.match		= ebt_among_mt,
+	.checkentry	= ebt_among_mt_check,
 	.matchsize	= -1, /* special case */
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 4a5226cbab8..6e7cd2f5ad7 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -15,9 +15,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_arp.h>
 
-static bool ebt_filter_arp(const struct sk_buff *skb,
-   const struct net_device *in,
-   const struct net_device *out, const void *data, unsigned int datalen)
+static bool
+ebt_arp_mt(const struct sk_buff *skb, const struct net_device *in,
+	   const struct net_device *out, const struct xt_match *match,
+	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_arp_info *info = data;
 	const struct arphdr *ah;
@@ -101,10 +102,13 @@ static bool ebt_filter_arp(const struct sk_buff *skb,
 	return true;
 }
 
-static bool ebt_arp_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_arp_mt_check(const char *table, const void *entry,
+		 const struct xt_match *match, void *data,
+		 unsigned int hook_mask)
 {
 	const struct ebt_arp_info *info = data;
+	const struct ebt_entry *e = entry;
 
 	if ((e->ethproto != htons(ETH_P_ARP) &&
 	   e->ethproto != htons(ETH_P_RARP)) ||
@@ -119,8 +123,8 @@ static struct ebt_match filter_arp __read_mostly = {
 	.name		= EBT_ARP_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_arp,
-	.check		= ebt_arp_check,
+	.match		= ebt_arp_mt,
+	.checkentry	= ebt_arp_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_arp_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 7ab16556800..6f2f6589777 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -15,9 +15,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_arpreply.h>
 
-static unsigned int ebt_target_reply(struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_arpreply_tg(struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, unsigned int hook_nr,
+		const struct xt_target *target, const void *data)
 {
 	struct ebt_arpreply_info *info = (void *)data;
 	const __be32 *siptr, *diptr;
@@ -58,10 +59,13 @@ static unsigned int ebt_target_reply(struct sk_buff *skb, unsigned int hooknr,
 	return info->target;
 }
 
-static bool ebt_target_reply_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_arpreply_tg_check(const char *tablename, const void *entry,
+		      const struct xt_target *target, void *data,
+		      unsigned int hookmask)
 {
 	const struct ebt_arpreply_info *info = data;
+	const struct ebt_entry *e = entry;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
 		return false;
@@ -78,8 +82,8 @@ static struct ebt_target reply_target __read_mostly = {
 	.name		= EBT_ARPREPLY_TARGET,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.target		= ebt_target_reply,
-	.check		= ebt_target_reply_check,
+	.target		= ebt_arpreply_tg,
+	.checkentry	= ebt_arpreply_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_arpreply_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index 64838e2835a..b7cc013bd37 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -14,9 +14,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_nat.h>
 
-static unsigned int ebt_target_dnat(struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_dnat_tg(struct sk_buff *skb, const struct net_device *in,
+	    const struct net_device *out, unsigned int hook_nr,
+	    const struct xt_target *target, const void *data)
 {
 	const struct ebt_nat_info *info = data;
 
@@ -27,8 +28,10 @@ static unsigned int ebt_target_dnat(struct sk_buff *skb, unsigned int hooknr,
 	return info->target;
 }
 
-static bool ebt_target_dnat_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_dnat_tg_check(const char *tablename, const void *entry,
+		  const struct xt_target *target, void *data,
+		  unsigned int hookmask)
 {
 	const struct ebt_nat_info *info = data;
 
@@ -48,8 +51,8 @@ static struct ebt_target dnat __read_mostly = {
 	.name		= EBT_DNAT_TARGET,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.target		= ebt_target_dnat,
-	.check		= ebt_target_dnat_check,
+	.target		= ebt_dnat_tg,
+	.checkentry	= ebt_dnat_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index 0bef6f7bc83..e7f3b1776b0 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -24,10 +24,10 @@ struct tcpudphdr {
 	__be16 dst;
 };
 
-static bool ebt_filter_ip(const struct sk_buff *skb,
-   const struct net_device *in,
-   const struct net_device *out, const void *data,
-   unsigned int datalen)
+static bool
+ebt_ip_mt(const struct sk_buff *skb, const struct net_device *in,
+	  const struct net_device *out, const struct xt_match *match,
+	  const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_ip_info *info = data;
 	const struct iphdr *ih;
@@ -79,10 +79,13 @@ static bool ebt_filter_ip(const struct sk_buff *skb,
 	return true;
 }
 
-static bool ebt_ip_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_ip_mt_check(const char *table, const void *entry,
+		const struct xt_match *match, void *data,
+		unsigned int hook_mask)
 {
 	const struct ebt_ip_info *info = data;
+	const struct ebt_entry *e = entry;
 
 	if (e->ethproto != htons(ETH_P_IP) ||
 	   e->invflags & EBT_IPROTO)
@@ -110,8 +113,8 @@ static struct ebt_match filter_ip __read_mostly = {
 	.name		= EBT_IP_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_ip,
-	.check		= ebt_ip_check,
+	.match		= ebt_ip_mt,
+	.checkentry	= ebt_ip_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_ip_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index afcabe205b8..807685da293 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -27,10 +27,10 @@ struct tcpudphdr {
 	__be16 dst;
 };
 
-static bool ebt_filter_ip6(const struct sk_buff *skb,
-   const struct net_device *in,
-   const struct net_device *out, const void *data,
-   unsigned int datalen)
+static bool
+ebt_ip6_mt(const struct sk_buff *skb, const struct net_device *in,
+	   const struct net_device *out, const struct xt_match *match,
+	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_ip6_info *info = (struct ebt_ip6_info *)data;
 	const struct ipv6hdr *ih6;
@@ -92,9 +92,12 @@ static bool ebt_filter_ip6(const struct sk_buff *skb,
 	return true;
 }
 
-static bool ebt_ip6_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_ip6_mt_check(const char *table, const void *entry,
+		 const struct xt_match *match, void *data,
+		 unsigned int hook_mask)
 {
+	const struct ebt_entry *e = entry;
 	struct ebt_ip6_info *info = (struct ebt_ip6_info *)data;
 
 	if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
@@ -123,8 +126,8 @@ static struct ebt_match filter_ip6 =
 	.name		= EBT_IP6_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_ip6,
-	.check		= ebt_ip6_check,
+	.match		= ebt_ip6_mt,
+	.checkentry	= ebt_ip6_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_ip6_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 9ca0a2564c8..d3372739227 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -30,9 +30,10 @@ static DEFINE_SPINLOCK(limit_lock);
 
 #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
 
-static bool ebt_limit_match(const struct sk_buff *skb,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static bool
+ebt_limit_mt(const struct sk_buff *skb, const struct net_device *in,
+	     const struct net_device *out, const struct xt_match *match,
+	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	struct ebt_limit_info *info = (struct ebt_limit_info *)data;
 	unsigned long now = jiffies;
@@ -65,8 +66,10 @@ user2credits(u_int32_t user)
 	return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE;
 }
 
-static bool ebt_limit_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_limit_mt_check(const char *table, const void *e,
+		   const struct xt_match *match, void *data,
+		   unsigned int hook_mask)
 {
 	struct ebt_limit_info *info = data;
 
@@ -90,8 +93,8 @@ static struct ebt_match ebt_limit_reg __read_mostly = {
 	.name		= EBT_LIMIT_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_limit_match,
-	.check		= ebt_limit_check,
+	.match		= ebt_limit_mt,
+	.checkentry	= ebt_limit_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_limit_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index c2e1c357025..424dfdf7f27 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -24,8 +24,10 @@
 
 static DEFINE_SPINLOCK(ebt_log_lock);
 
-static bool ebt_log_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_log_tg_check(const char *table, const void *entry,
+		 const struct xt_target *target, void *data,
+		 unsigned int hook_mask)
 {
 	struct ebt_log_info *info = data;
 
@@ -192,9 +194,10 @@ out:
 
 }
 
-static unsigned int ebt_log(const struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_log_tg(struct sk_buff *skb, const struct net_device *in,
+	   const struct net_device *out, unsigned int hooknr,
+	   const struct xt_target *target, const void *data)
 {
 	const struct ebt_log_info *info = data;
 	struct nf_loginfo li;
@@ -217,8 +220,8 @@ static struct ebt_watcher log =
 	.name		= EBT_LOG_WATCHER,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.watcher	= ebt_log,
-	.check		= ebt_log_check,
+	.target		= ebt_log_tg,
+	.checkentry	= ebt_log_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_log_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 910721a1267..92c67271bd8 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -18,9 +18,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_mark_t.h>
 
-static unsigned int ebt_target_mark(struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_mark_tg(struct sk_buff *skb, const struct net_device *in,
+	    const struct net_device *out, unsigned int hook_nr,
+	    const struct xt_target *target, const void *data)
 {
 	const struct ebt_mark_t_info *info = data;
 	int action = info->target & -16;
@@ -37,8 +38,10 @@ static unsigned int ebt_target_mark(struct sk_buff *skb, unsigned int hooknr,
 	return info->target | ~EBT_VERDICT_BITS;
 }
 
-static bool ebt_target_mark_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_mark_tg_check(const char *table, const void *e,
+		  const struct xt_target *target, void *data,
+		  unsigned int hookmask)
 {
 	const struct ebt_mark_t_info *info = data;
 	int tmp;
@@ -60,8 +63,8 @@ static struct ebt_target mark_target __read_mostly = {
 	.name		= EBT_MARK_TARGET,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.target		= ebt_target_mark,
-	.check		= ebt_target_mark_check,
+	.target		= ebt_mark_tg,
+	.checkentry	= ebt_mark_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_mark_t_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index 6512ad9b409..db64a0de4f7 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -12,9 +12,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_mark_m.h>
 
-static bool ebt_filter_mark(const struct sk_buff *skb,
-   const struct net_device *in, const struct net_device *out, const void *data,
-   unsigned int datalen)
+static bool
+ebt_mark_mt(const struct sk_buff *skb, const struct net_device *in,
+	    const struct net_device *out, const struct xt_match *match,
+	    const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_mark_m_info *info = data;
 
@@ -23,8 +24,10 @@ static bool ebt_filter_mark(const struct sk_buff *skb,
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static bool ebt_mark_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_mark_mt_check(const char *table, const void *e,
+		  const struct xt_match *match, void *data,
+		  unsigned int hook_mask)
 {
 	const struct ebt_mark_m_info *info = data;
 
@@ -41,8 +44,8 @@ static struct ebt_match filter_mark __read_mostly = {
 	.name		= EBT_MARK_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_mark,
-	.check		= ebt_mark_check,
+	.match		= ebt_mark_mt,
+	.checkentry	= ebt_mark_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_mark_m_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index aa0410c69a6..b415f887188 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -19,11 +19,10 @@
 #include <linux/netfilter_bridge/ebt_nflog.h>
 #include <net/netfilter/nf_log.h>
 
-static unsigned int ebt_nflog(const struct sk_buff *skb,
-			      unsigned int hooknr,
-			      const struct net_device *in,
-			      const struct net_device *out,
-			      const void *data, unsigned int datalen)
+static unsigned int
+ebt_nflog_tg(struct sk_buff *skb, const struct net_device *in,
+	     const struct net_device *out, unsigned int hooknr,
+	     const struct xt_target *target, const void *data)
 {
 	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
 	struct nf_loginfo li;
@@ -37,10 +36,10 @@ static unsigned int ebt_nflog(const struct sk_buff *skb,
 	return EBT_CONTINUE;
 }
 
-static bool ebt_nflog_check(const char *tablename,
-			    unsigned int hookmask,
-			    const struct ebt_entry *e,
-			    void *data, unsigned int datalen)
+static bool
+ebt_nflog_tg_check(const char *table, const void *e,
+		   const struct xt_target *target, void *data,
+		   unsigned int hookmask)
 {
 	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
 
@@ -54,8 +53,8 @@ static struct ebt_watcher nflog __read_mostly = {
 	.name = EBT_NFLOG_WATCHER,
 	.revision = 0,
 	.family = NFPROTO_BRIDGE,
-	.watcher = ebt_nflog,
-	.check = ebt_nflog_check,
+	.target = ebt_nflog_tg,
+	.checkentry = ebt_nflog_tg_check,
 	.targetsize = XT_ALIGN(sizeof(struct ebt_nflog_info)),
 	.me = THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index a9acecc88e9..06393452ef9 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -12,19 +12,21 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_pkttype.h>
 
-static bool ebt_filter_pkttype(const struct sk_buff *skb,
-   const struct net_device *in,
-   const struct net_device *out,
-   const void *data,
-   unsigned int datalen)
+static bool
+ebt_pkttype_mt(const struct sk_buff *skb, const struct net_device *in,
+	       const struct net_device *out, const struct xt_match *match,
+	       const void *data, int offset, unsigned int protoff,
+	       bool *hotdrop)
 {
 	const struct ebt_pkttype_info *info = data;
 
 	return (skb->pkt_type == info->pkt_type) ^ info->invert;
 }
 
-static bool ebt_pkttype_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_pkttype_mt_check(const char *table, const void *e,
+		     const struct xt_match *match, void *data,
+		     unsigned int hook_mask)
 {
 	const struct ebt_pkttype_info *info = data;
 
@@ -38,8 +40,8 @@ static struct ebt_match filter_pkttype __read_mostly = {
 	.name		= EBT_PKTTYPE_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_pkttype,
-	.check		= ebt_pkttype_check,
+	.match		= ebt_pkttype_mt,
+	.checkentry	= ebt_pkttype_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_pkttype_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 4c628108bcd..e9540cf4f6d 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -15,10 +15,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_redirect.h>
 
-static unsigned int ebt_target_redirect(struct sk_buff *skb,
-   unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_redirect_tg(struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, unsigned int hooknr,
+		const struct xt_target *target, const void *data)
 {
 	const struct ebt_redirect_info *info = data;
 
@@ -34,8 +34,10 @@ static unsigned int ebt_target_redirect(struct sk_buff *skb,
 	return info->target;
 }
 
-static bool ebt_target_redirect_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_redirect_tg_check(const char *tablename, const void *e,
+		      const struct xt_target *target, void *data,
+		      unsigned int hookmask)
 {
 	const struct ebt_redirect_info *info = data;
 
@@ -54,8 +56,8 @@ static struct ebt_target redirect_target __read_mostly = {
 	.name		= EBT_REDIRECT_TARGET,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.target		= ebt_target_redirect,
-	.check		= ebt_target_redirect_check,
+	.target		= ebt_redirect_tg,
+	.checkentry	= ebt_redirect_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_redirect_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 0e83de781c0..363d0051e04 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -16,9 +16,10 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_nat.h>
 
-static unsigned int ebt_target_snat(struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_snat_tg(struct sk_buff *skb, const struct net_device *in,
+	    const struct net_device *out, unsigned int hook_nr,
+	    const struct xt_target *target, const void *data)
 {
 	const struct ebt_nat_info *info = data;
 
@@ -43,8 +44,10 @@ out:
 	return info->target | ~EBT_VERDICT_BITS;
 }
 
-static bool ebt_target_snat_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_snat_tg_check(const char *tablename, const void *e,
+		  const struct xt_target *target, void *data,
+		  unsigned int hookmask)
 {
 	const struct ebt_nat_info *info = data;
 	int tmp;
@@ -70,8 +73,8 @@ static struct ebt_target snat __read_mostly = {
 	.name		= EBT_SNAT_TARGET,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.target		= ebt_target_snat,
-	.check		= ebt_target_snat_check,
+	.target		= ebt_snat_tg,
+	.checkentry	= ebt_snat_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index e6d8f0c140a..7576d1d62a4 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -119,9 +119,10 @@ static bool ebt_filter_config(const struct ebt_stp_info *info,
 	return true;
 }
 
-static bool ebt_filter_stp(const struct sk_buff *skb,
-   const struct net_device *in,
-   const struct net_device *out, const void *data, unsigned int datalen)
+static bool
+ebt_stp_mt(const struct sk_buff *skb, const struct net_device *in,
+	   const struct net_device *out, const struct xt_match *match,
+	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_stp_info *info = data;
 	const struct stp_header *sp;
@@ -154,12 +155,15 @@ static bool ebt_filter_stp(const struct sk_buff *skb,
 	return true;
 }
 
-static bool ebt_stp_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_stp_mt_check(const char *table, const void *entry,
+		 const struct xt_match *match, void *data,
+		 unsigned int hook_mask)
 {
 	const struct ebt_stp_info *info = data;
 	const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
 	const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+	const struct ebt_entry *e = entry;
 
 	if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
 	    !(info->bitmask & EBT_STP_MASK))
@@ -176,8 +180,8 @@ static struct ebt_match filter_stp __read_mostly = {
 	.name		= EBT_STP_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_stp,
-	.check		= ebt_stp_check,
+	.match		= ebt_stp_mt,
+	.checkentry	= ebt_stp_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_stp_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 076b44590f1..77ff9c46b26 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -246,9 +246,10 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum,
 	ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
 }
 
-static unsigned int ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static unsigned int
+ebt_ulog_tg(struct sk_buff *skb, const struct net_device *in,
+	    const struct net_device *out, unsigned int hooknr,
+	    const struct xt_target *target, const void *data)
 {
 	const struct ebt_ulog_info *uloginfo = data;
 
@@ -256,8 +257,10 @@ static unsigned int ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
 	return EBT_CONTINUE;
 }
 
-static bool ebt_ulog_check(const char *tablename, unsigned int hookmask,
-   const struct ebt_entry *e, void *data, unsigned int datalen)
+static bool
+ebt_ulog_tg_check(const char *table, const void *entry,
+		  const struct xt_target *target, void *data,
+		  unsigned int hookmask)
 {
 	struct ebt_ulog_info *uloginfo = data;
 
@@ -276,8 +279,8 @@ static struct ebt_watcher ulog __read_mostly = {
 	.name		= EBT_ULOG_WATCHER,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.watcher	= ebt_ulog,
-	.check		= ebt_ulog_check,
+	.target		= ebt_ulog_tg,
+	.checkentry	= ebt_ulog_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_ulog_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 9e3a39ae466..3af688b0fc3 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -41,10 +41,9 @@ MODULE_LICENSE("GPL");
 #define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; }
 
 static bool
-ebt_filter_vlan(const struct sk_buff *skb,
-		const struct net_device *in,
-		const struct net_device *out,
-		const void *data, unsigned int datalen)
+ebt_vlan_mt(const struct sk_buff *skb, const struct net_device *in,
+	    const struct net_device *out, const struct xt_match *match,
+	    const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ebt_vlan_info *info = data;
 	const struct vlan_hdr *fp;
@@ -88,11 +87,12 @@ ebt_filter_vlan(const struct sk_buff *skb,
 }
 
 static bool
-ebt_check_vlan(const char *tablename,
-	       unsigned int hooknr,
-	       const struct ebt_entry *e, void *data, unsigned int datalen)
+ebt_vlan_mt_check(const char *table, const void *entry,
+		  const struct xt_match *match, void *data,
+		  unsigned int hook_mask)
 {
 	struct ebt_vlan_info *info = data;
+	const struct ebt_entry *e = entry;
 
 	/* Is it 802.1Q frame checked? */
 	if (e->ethproto != htons(ETH_P_8021Q)) {
@@ -166,8 +166,8 @@ static struct ebt_match filter_vlan __read_mostly = {
 	.name		= EBT_VLAN_MATCH,
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
-	.match		= ebt_filter_vlan,
-	.check		= ebt_check_vlan,
+	.match		= ebt_vlan_mt,
+	.checkentry	= ebt_vlan_mt_check,
 	.matchsize	= XT_ALIGN(sizeof(struct ebt_vlan_info)),
 	.me		= THIS_MODULE,
 };
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index bc4b3f4f37c..340e1c6bdcb 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -67,11 +67,10 @@ static struct ebt_target ebt_standard_target = {
 };
 
 static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
-   const struct sk_buff *skb, unsigned int hooknr, const struct net_device *in,
+   struct sk_buff *skb, unsigned int hooknr, const struct net_device *in,
    const struct net_device *out)
 {
-	w->u.watcher->watcher(skb, hooknr, in, out, w->data,
-	   w->watcher_size);
+	w->u.watcher->target(skb, in, out, hooknr, NULL, w->data);
 	/* watchers don't give a verdict */
 	return 0;
 }
@@ -80,8 +79,7 @@ static inline int ebt_do_match (struct ebt_entry_match *m,
    const struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out)
 {
-	return m->u.match->match(skb, in, out, m->data,
-	   m->match_size);
+	return m->u.match->match(skb, in, out, NULL, m->data, 0, 0, NULL);
 }
 
 static inline int ebt_dev_check(char *entry, const struct net_device *device)
@@ -195,8 +193,8 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 		if (!t->u.target->target)
 			verdict = ((struct ebt_standard_target *)t)->verdict;
 		else
-			verdict = t->u.target->target(skb, hook,
-			   in, out, t->data, t->target_size);
+			verdict = t->u.target->target(skb, in, out, hook,
+				  NULL, t->data);
 		if (verdict == EBT_ACCEPT) {
 			read_unlock_bh(&table->lock);
 			return NF_ACCEPT;
@@ -376,8 +374,8 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		       match->name, XT_ALIGN(match->matchsize), m->match_size);
 		goto out;
 	}
-	if (match->check &&
-	    !match->check(name, hookmask, e, m->data, m->match_size)) {
+	if (match->checkentry &&
+	    !match->checkentry(name, e, NULL, m->data, hookmask)) {
 		BUGPRINT("match->check failed\n");
 		goto out;
 	}
@@ -426,8 +424,8 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 		       w->watcher_size);
 		goto out;
 	}
-	if (watcher->check &&
-	    !watcher->check(name, hookmask, e, w->data, w->watcher_size)) {
+	if (watcher->checkentry &&
+	    !watcher->checkentry(name, e, NULL, w->data, hookmask)) {
 		BUGPRINT("watcher->check failed\n");
 		goto out;
 	}
@@ -609,7 +607,7 @@ ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i)
 	if (i && (*i)-- == 0)
 		return 1;
 	if (m->u.match->destroy)
-		m->u.match->destroy(m->data, m->match_size);
+		m->u.match->destroy(NULL, m->data);
 	module_put(m->u.match->me);
 
 	return 0;
@@ -621,7 +619,7 @@ ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i)
 	if (i && (*i)-- == 0)
 		return 1;
 	if (w->u.watcher->destroy)
-		w->u.watcher->destroy(w->data, w->watcher_size);
+		w->u.watcher->destroy(NULL, w->data);
 	module_put(w->u.watcher->me);
 
 	return 0;
@@ -641,7 +639,7 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
 	EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 	if (t->u.target->destroy)
-		t->u.target->destroy(t->data, t->target_size);
+		t->u.target->destroy(NULL, t->data);
 	module_put(t->u.target->me);
 
 	return 0;
@@ -755,8 +753,8 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 		module_put(t->u.target->me);
 		ret = -EINVAL;
 		goto cleanup_watchers;
-	} else if (t->u.target->check &&
-	    !t->u.target->check(name, hookmask, e, t->data, t->target_size)) {
+	} else if (t->u.target->checkentry &&
+	    !t->u.target->checkentry(name, e, NULL, t->data, hookmask)) {
 		module_put(t->u.target->me);
 		ret = -EFAULT;
 		goto cleanup_watchers;
-- 
cgit v1.2.3


From 043ef46c7690bfdbd5b012e15812a14a19ca5604 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:15 +0200
Subject: netfilter: move Ebtables to use Xtables

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/Kconfig        |   1 +
 net/bridge/netfilter/ebt_802_3.c    |   8 +-
 net/bridge/netfilter/ebt_among.c    |  14 +--
 net/bridge/netfilter/ebt_arp.c      |   8 +-
 net/bridge/netfilter/ebt_arpreply.c |   8 +-
 net/bridge/netfilter/ebt_dnat.c     |   8 +-
 net/bridge/netfilter/ebt_ip.c       |   8 +-
 net/bridge/netfilter/ebt_ip6.c      |   9 +-
 net/bridge/netfilter/ebt_limit.c    |   8 +-
 net/bridge/netfilter/ebt_log.c      |   9 +-
 net/bridge/netfilter/ebt_mark.c     |   8 +-
 net/bridge/netfilter/ebt_mark_m.c   |   8 +-
 net/bridge/netfilter/ebt_nflog.c    |  16 ++--
 net/bridge/netfilter/ebt_pkttype.c  |   8 +-
 net/bridge/netfilter/ebt_redirect.c |   8 +-
 net/bridge/netfilter/ebt_snat.c     |   8 +-
 net/bridge/netfilter/ebt_stp.c      |   8 +-
 net/bridge/netfilter/ebt_ulog.c     |  10 +--
 net/bridge/netfilter/ebt_vlan.c     |   8 +-
 net/bridge/netfilter/ebtables.c     | 173 ++++++++++++++----------------------
 net/netfilter/x_tables.c            |   9 +-
 21 files changed, 155 insertions(+), 190 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 90947979499..e7c197ffb2f 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -7,6 +7,7 @@ menu "Bridge: Netfilter Configuration"
 
 config BRIDGE_NF_EBTABLES
 	tristate "Ethernet Bridge tables (ebtables) support"
+	select NETFILTER_XTABLES
 	help
 	  ebtables is a general, extensible frame/packet identification
 	  framework. Say 'Y' or 'M' here if you want to do Ethernet
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 6f1a69c28ed..6fc2a59e09a 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -51,8 +51,8 @@ ebt_802_3_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_802_3 __read_mostly = {
-	.name		= EBT_802_3_MATCH,
+static struct xt_match ebt_802_3_mt_reg __read_mostly = {
+	.name		= "802_3",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_802_3_mt,
@@ -63,12 +63,12 @@ static struct ebt_match filter_802_3 __read_mostly = {
 
 static int __init ebt_802_3_init(void)
 {
-	return ebt_register_match(&filter_802_3);
+	return xt_register_match(&ebt_802_3_mt_reg);
 }
 
 static void __exit ebt_802_3_fini(void)
 {
-	ebt_unregister_match(&filter_802_3);
+	xt_unregister_match(&ebt_802_3_mt_reg);
 }
 
 module_init(ebt_802_3_init);
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 84a306f085b..084559e1840 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -7,12 +7,12 @@
  *  August, 2003
  *
  */
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_among.h>
 #include <linux/ip.h>
 #include <linux/if_arp.h>
 #include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_among.h>
 
 static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
 				      const char *mac, __be32 ip)
@@ -211,8 +211,8 @@ ebt_among_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_among __read_mostly = {
-	.name		= EBT_AMONG_MATCH,
+static struct xt_match ebt_among_mt_reg __read_mostly = {
+	.name		= "among",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_among_mt,
@@ -223,12 +223,12 @@ static struct ebt_match filter_among __read_mostly = {
 
 static int __init ebt_among_init(void)
 {
-	return ebt_register_match(&filter_among);
+	return xt_register_match(&ebt_among_mt_reg);
 }
 
 static void __exit ebt_among_fini(void)
 {
-	ebt_unregister_match(&filter_among);
+	xt_unregister_match(&ebt_among_mt_reg);
 }
 
 module_init(ebt_among_init);
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 6e7cd2f5ad7..a073dffe7a1 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -119,8 +119,8 @@ ebt_arp_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_arp __read_mostly = {
-	.name		= EBT_ARP_MATCH,
+static struct xt_match ebt_arp_mt_reg __read_mostly = {
+	.name		= "arp",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_arp_mt,
@@ -131,12 +131,12 @@ static struct ebt_match filter_arp __read_mostly = {
 
 static int __init ebt_arp_init(void)
 {
-	return ebt_register_match(&filter_arp);
+	return xt_register_match(&ebt_arp_mt_reg);
 }
 
 static void __exit ebt_arp_fini(void)
 {
-	ebt_unregister_match(&filter_arp);
+	xt_unregister_match(&ebt_arp_mt_reg);
 }
 
 module_init(ebt_arp_init);
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 6f2f6589777..8071b64af46 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -78,8 +78,8 @@ ebt_arpreply_tg_check(const char *tablename, const void *entry,
 	return true;
 }
 
-static struct ebt_target reply_target __read_mostly = {
-	.name		= EBT_ARPREPLY_TARGET,
+static struct xt_target ebt_arpreply_tg_reg __read_mostly = {
+	.name		= "arpreply",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_arpreply_tg,
@@ -90,12 +90,12 @@ static struct ebt_target reply_target __read_mostly = {
 
 static int __init ebt_arpreply_init(void)
 {
-	return ebt_register_target(&reply_target);
+	return xt_register_target(&ebt_arpreply_tg_reg);
 }
 
 static void __exit ebt_arpreply_fini(void)
 {
-	ebt_unregister_target(&reply_target);
+	xt_unregister_target(&ebt_arpreply_tg_reg);
 }
 
 module_init(ebt_arpreply_init);
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index b7cc013bd37..d2211c4a477 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -47,8 +47,8 @@ ebt_dnat_tg_check(const char *tablename, const void *entry,
 	return true;
 }
 
-static struct ebt_target dnat __read_mostly = {
-	.name		= EBT_DNAT_TARGET,
+static struct xt_target ebt_dnat_tg_reg __read_mostly = {
+	.name		= "dnat",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_dnat_tg,
@@ -59,12 +59,12 @@ static struct ebt_target dnat __read_mostly = {
 
 static int __init ebt_dnat_init(void)
 {
-	return ebt_register_target(&dnat);
+	return xt_register_target(&ebt_dnat_tg_reg);
 }
 
 static void __exit ebt_dnat_fini(void)
 {
-	ebt_unregister_target(&dnat);
+	xt_unregister_target(&ebt_dnat_tg_reg);
 }
 
 module_init(ebt_dnat_init);
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index e7f3b1776b0..b42c7ce799b 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -109,8 +109,8 @@ ebt_ip_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_ip __read_mostly = {
-	.name		= EBT_IP_MATCH,
+static struct xt_match ebt_ip_mt_reg __read_mostly = {
+	.name		= "ip",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_ip_mt,
@@ -121,12 +121,12 @@ static struct ebt_match filter_ip __read_mostly = {
 
 static int __init ebt_ip_init(void)
 {
-	return ebt_register_match(&filter_ip);
+	return xt_register_match(&ebt_ip_mt_reg);
 }
 
 static void __exit ebt_ip_fini(void)
 {
-	ebt_unregister_match(&filter_ip);
+	xt_unregister_match(&ebt_ip_mt_reg);
 }
 
 module_init(ebt_ip_init);
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 807685da293..317e624ae59 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -121,9 +121,8 @@ ebt_ip6_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_ip6 =
-{
-	.name		= EBT_IP6_MATCH,
+static struct xt_match ebt_ip6_mt_reg __read_mostly = {
+	.name		= "ip6",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_ip6_mt,
@@ -134,12 +133,12 @@ static struct ebt_match filter_ip6 =
 
 static int __init ebt_ip6_init(void)
 {
-	return ebt_register_match(&filter_ip6);
+	return xt_register_match(&ebt_ip6_mt_reg);
 }
 
 static void __exit ebt_ip6_fini(void)
 {
-	ebt_unregister_match(&filter_ip6);
+	xt_unregister_match(&ebt_ip6_mt_reg);
 }
 
 module_init(ebt_ip6_init);
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index d3372739227..43d9a500363 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -89,8 +89,8 @@ ebt_limit_mt_check(const char *table, const void *e,
 	return true;
 }
 
-static struct ebt_match ebt_limit_reg __read_mostly = {
-	.name		= EBT_LIMIT_MATCH,
+static struct xt_match ebt_limit_mt_reg __read_mostly = {
+	.name		= "limit",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_limit_mt,
@@ -101,12 +101,12 @@ static struct ebt_match ebt_limit_reg __read_mostly = {
 
 static int __init ebt_limit_init(void)
 {
-	return ebt_register_match(&ebt_limit_reg);
+	return xt_register_match(&ebt_limit_mt_reg);
 }
 
 static void __exit ebt_limit_fini(void)
 {
-	ebt_unregister_match(&ebt_limit_reg);
+	xt_unregister_match(&ebt_limit_mt_reg);
 }
 
 module_init(ebt_limit_init);
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 424dfdf7f27..b40f9ed4c34 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -215,9 +215,8 @@ ebt_log_tg(struct sk_buff *skb, const struct net_device *in,
 	return EBT_CONTINUE;
 }
 
-static struct ebt_watcher log =
-{
-	.name		= EBT_LOG_WATCHER,
+static struct xt_target ebt_log_tg_reg __read_mostly = {
+	.name		= "log",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_log_tg,
@@ -236,7 +235,7 @@ static int __init ebt_log_init(void)
 {
 	int ret;
 
-	ret = ebt_register_watcher(&log);
+	ret = xt_register_target(&ebt_log_tg_reg);
 	if (ret < 0)
 		return ret;
 	nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger);
@@ -246,7 +245,7 @@ static int __init ebt_log_init(void)
 static void __exit ebt_log_fini(void)
 {
 	nf_log_unregister(&ebt_log_logger);
-	ebt_unregister_watcher(&log);
+	xt_unregister_target(&ebt_log_tg_reg);
 }
 
 module_init(ebt_log_init);
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 92c67271bd8..dff19fc91cf 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -59,8 +59,8 @@ ebt_mark_tg_check(const char *table, const void *e,
 	return true;
 }
 
-static struct ebt_target mark_target __read_mostly = {
-	.name		= EBT_MARK_TARGET,
+static struct xt_target ebt_mark_tg_reg __read_mostly = {
+	.name		= "mark",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_mark_tg,
@@ -71,12 +71,12 @@ static struct ebt_target mark_target __read_mostly = {
 
 static int __init ebt_mark_init(void)
 {
-	return ebt_register_target(&mark_target);
+	return xt_register_target(&ebt_mark_tg_reg);
 }
 
 static void __exit ebt_mark_fini(void)
 {
-	ebt_unregister_target(&mark_target);
+	xt_unregister_target(&ebt_mark_tg_reg);
 }
 
 module_init(ebt_mark_init);
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index db64a0de4f7..aa6781c7f98 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -40,8 +40,8 @@ ebt_mark_mt_check(const char *table, const void *e,
 	return true;
 }
 
-static struct ebt_match filter_mark __read_mostly = {
-	.name		= EBT_MARK_MATCH,
+static struct xt_match ebt_mark_mt_reg __read_mostly = {
+	.name		= "mark_m",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_mark_mt,
@@ -52,12 +52,12 @@ static struct ebt_match filter_mark __read_mostly = {
 
 static int __init ebt_mark_m_init(void)
 {
-	return ebt_register_match(&filter_mark);
+	return xt_register_match(&ebt_mark_mt_reg);
 }
 
 static void __exit ebt_mark_m_fini(void)
 {
-	ebt_unregister_match(&filter_mark);
+	xt_unregister_match(&ebt_mark_mt_reg);
 }
 
 module_init(ebt_mark_m_init);
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index b415f887188..917ac360079 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -49,24 +49,24 @@ ebt_nflog_tg_check(const char *table, const void *e,
 	return true;
 }
 
-static struct ebt_watcher nflog __read_mostly = {
-	.name = EBT_NFLOG_WATCHER,
-	.revision = 0,
-	.family = NFPROTO_BRIDGE,
-	.target = ebt_nflog_tg,
+static struct xt_target ebt_nflog_tg_reg __read_mostly = {
+	.name       = "nflog",
+	.revision   = 0,
+	.family     = NFPROTO_BRIDGE,
+	.target     = ebt_nflog_tg,
 	.checkentry = ebt_nflog_tg_check,
 	.targetsize = XT_ALIGN(sizeof(struct ebt_nflog_info)),
-	.me = THIS_MODULE,
+	.me         = THIS_MODULE,
 };
 
 static int __init ebt_nflog_init(void)
 {
-	return ebt_register_watcher(&nflog);
+	return xt_register_target(&ebt_nflog_tg_reg);
 }
 
 static void __exit ebt_nflog_fini(void)
 {
-	ebt_unregister_watcher(&nflog);
+	xt_unregister_target(&ebt_nflog_tg_reg);
 }
 
 module_init(ebt_nflog_init);
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 06393452ef9..1c04ce5a52c 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -36,8 +36,8 @@ ebt_pkttype_mt_check(const char *table, const void *e,
 	return true;
 }
 
-static struct ebt_match filter_pkttype __read_mostly = {
-	.name		= EBT_PKTTYPE_MATCH,
+static struct xt_match ebt_pkttype_mt_reg __read_mostly = {
+	.name		= "pkttype",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_pkttype_mt,
@@ -48,12 +48,12 @@ static struct ebt_match filter_pkttype __read_mostly = {
 
 static int __init ebt_pkttype_init(void)
 {
-	return ebt_register_match(&filter_pkttype);
+	return xt_register_match(&ebt_pkttype_mt_reg);
 }
 
 static void __exit ebt_pkttype_fini(void)
 {
-	ebt_unregister_match(&filter_pkttype);
+	xt_unregister_match(&ebt_pkttype_mt_reg);
 }
 
 module_init(ebt_pkttype_init);
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index e9540cf4f6d..1b7684ffe40 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -52,8 +52,8 @@ ebt_redirect_tg_check(const char *tablename, const void *e,
 	return true;
 }
 
-static struct ebt_target redirect_target __read_mostly = {
-	.name		= EBT_REDIRECT_TARGET,
+static struct xt_target ebt_redirect_tg_reg __read_mostly = {
+	.name		= "redirect",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_redirect_tg,
@@ -64,12 +64,12 @@ static struct ebt_target redirect_target __read_mostly = {
 
 static int __init ebt_redirect_init(void)
 {
-	return ebt_register_target(&redirect_target);
+	return xt_register_target(&ebt_redirect_tg_reg);
 }
 
 static void __exit ebt_redirect_fini(void)
 {
-	ebt_unregister_target(&redirect_target);
+	xt_unregister_target(&ebt_redirect_tg_reg);
 }
 
 module_init(ebt_redirect_init);
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 363d0051e04..c90217a4f9e 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -69,8 +69,8 @@ ebt_snat_tg_check(const char *tablename, const void *e,
 	return true;
 }
 
-static struct ebt_target snat __read_mostly = {
-	.name		= EBT_SNAT_TARGET,
+static struct xt_target ebt_snat_tg_reg __read_mostly = {
+	.name		= "snat",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_snat_tg,
@@ -81,12 +81,12 @@ static struct ebt_target snat __read_mostly = {
 
 static int __init ebt_snat_init(void)
 {
-	return ebt_register_target(&snat);
+	return xt_register_target(&ebt_snat_tg_reg);
 }
 
 static void __exit ebt_snat_fini(void)
 {
-	ebt_unregister_target(&snat);
+	xt_unregister_target(&ebt_snat_tg_reg);
 }
 
 module_init(ebt_snat_init);
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 7576d1d62a4..28bb48b67a8 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -176,8 +176,8 @@ ebt_stp_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_stp __read_mostly = {
-	.name		= EBT_STP_MATCH,
+static struct xt_match ebt_stp_mt_reg __read_mostly = {
+	.name		= "stp",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_stp_mt,
@@ -188,12 +188,12 @@ static struct ebt_match filter_stp __read_mostly = {
 
 static int __init ebt_stp_init(void)
 {
-	return ebt_register_match(&filter_stp);
+	return xt_register_match(&ebt_stp_mt_reg);
 }
 
 static void __exit ebt_stp_fini(void)
 {
-	ebt_unregister_match(&filter_stp);
+	xt_unregister_match(&ebt_stp_mt_reg);
 }
 
 module_init(ebt_stp_init);
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 77ff9c46b26..25ca6467349 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -275,8 +275,8 @@ ebt_ulog_tg_check(const char *table, const void *entry,
 	return 0;
 }
 
-static struct ebt_watcher ulog __read_mostly = {
-	.name		= EBT_ULOG_WATCHER,
+static struct xt_target ebt_ulog_tg_reg __read_mostly = {
+	.name		= "ulog",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.target		= ebt_ulog_tg,
@@ -286,7 +286,7 @@ static struct ebt_watcher ulog __read_mostly = {
 };
 
 static const struct nf_logger ebt_ulog_logger = {
-	.name		= EBT_ULOG_WATCHER,
+	.name		= "ulog",
 	.logfn		= &ebt_log_packet,
 	.me		= THIS_MODULE,
 };
@@ -315,7 +315,7 @@ static int __init ebt_ulog_init(void)
 		printk(KERN_WARNING KBUILD_MODNAME ": out of memory trying to "
 		       "call netlink_kernel_create\n");
 		ret = false;
-	} else if (ebt_register_watcher(&ulog) != 0) {
+	} else if (xt_register_target(&ebt_ulog_tg_reg) != 0) {
 		netlink_kernel_release(ebtulognl);
 	}
 
@@ -331,7 +331,7 @@ static void __exit ebt_ulog_fini(void)
 	int i;
 
 	nf_log_unregister(&ebt_ulog_logger);
-	ebt_unregister_watcher(&ulog);
+	xt_unregister_target(&ebt_ulog_tg_reg);
 	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
 		ub = &ulog_buffers[i];
 		if (timer_pending(&ub->timer))
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 3af688b0fc3..5addef6d62f 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -162,8 +162,8 @@ ebt_vlan_mt_check(const char *table, const void *entry,
 	return true;
 }
 
-static struct ebt_match filter_vlan __read_mostly = {
-	.name		= EBT_VLAN_MATCH,
+static struct xt_match ebt_vlan_mt_reg __read_mostly = {
+	.name		= "vlan",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
 	.match		= ebt_vlan_mt,
@@ -177,12 +177,12 @@ static int __init ebt_vlan_init(void)
 	DEBUG_MSG("ebtables 802.1Q extension module v"
 		  MODULE_VERS "\n");
 	DEBUG_MSG("module debug=%d\n", !!debug);
-	return ebt_register_match(&filter_vlan);
+	return xt_register_match(&ebt_vlan_mt_reg);
 }
 
 static void __exit ebt_vlan_fini(void)
 {
-	ebt_unregister_match(&filter_vlan);
+	xt_unregister_match(&ebt_vlan_mt_reg);
 }
 
 module_init(ebt_vlan_init);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 340e1c6bdcb..c4f7a2e8ed3 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -60,17 +60,18 @@ static LIST_HEAD(ebt_targets);
 static LIST_HEAD(ebt_matches);
 static LIST_HEAD(ebt_watchers);
 
-static struct ebt_target ebt_standard_target = {
+static struct xt_target ebt_standard_target = {
 	.name       = "standard",
 	.revision   = 0,
 	.family     = NFPROTO_BRIDGE,
+	.targetsize = sizeof(int),
 };
 
 static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
    struct sk_buff *skb, unsigned int hooknr, const struct net_device *in,
    const struct net_device *out)
 {
-	w->u.watcher->target(skb, in, out, hooknr, NULL, w->data);
+	w->u.watcher->target(skb, in, out, hooknr, w->u.watcher, w->data);
 	/* watchers don't give a verdict */
 	return 0;
 }
@@ -79,7 +80,7 @@ static inline int ebt_do_match (struct ebt_entry_match *m,
    const struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out)
 {
-	return m->u.match->match(skb, in, out, NULL, m->data, 0, 0, NULL);
+	return m->u.match->match(skb, in, out, m->u.match, m->data, 0, 0, NULL);
 }
 
 static inline int ebt_dev_check(char *entry, const struct net_device *device)
@@ -194,7 +195,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 			verdict = ((struct ebt_standard_target *)t)->verdict;
 		else
 			verdict = t->u.target->target(skb, in, out, hook,
-				  NULL, t->data);
+				  t->u.target, t->data);
 		if (verdict == EBT_ACCEPT) {
 			read_unlock_bh(&table->lock);
 			return NF_ACCEPT;
@@ -336,104 +337,73 @@ static inline int
 ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
    const char *name, unsigned int hookmask, unsigned int *cnt)
 {
-	struct ebt_match *match;
+	struct xt_match *match;
 	size_t left = ((char *)e + e->watchers_offset) - (char *)m;
 	int ret;
 
 	if (left < sizeof(struct ebt_entry_match) ||
 	    left - sizeof(struct ebt_entry_match) < m->match_size)
 		return -EINVAL;
-	match = find_match_lock(m->u.name, &ret, &ebt_mutex);
-	if (!match)
-		return ret;
-	m->u.match = match;
-	if (!try_module_get(match->me)) {
-		mutex_unlock(&ebt_mutex);
+
+	match = try_then_request_module(xt_find_match(NFPROTO_BRIDGE,
+		m->u.name, 0), "ebt_%s", m->u.name);
+	if (IS_ERR(match))
+		return PTR_ERR(match);
+	if (match == NULL)
 		return -ENOENT;
-	}
-	mutex_unlock(&ebt_mutex);
-	if (match->family != NFPROTO_BRIDGE) {
-		printk(KERN_WARNING "ebtables: %s match: not for ebtables?\n",
-		       match->name);
-		goto out;
-	}
-	if (match->revision != 0) {
-		printk(KERN_WARNING "ebtables: %s match: ebtables is not "
-		       "supporting revisions at this time\n",
-		       match->name);
-		goto out;
-	}
-	if (XT_ALIGN(match->matchsize) != m->match_size &&
-	    match->matchsize != -1) {
-		/*
-		 * ebt_among is exempt from centralized matchsize checking
-		 * because it uses a dynamic-size data set.
-		 */
-		printk(KERN_WARNING "ebtables: %s match: "
-		       "invalid size %Zu != %u\n",
-		       match->name, XT_ALIGN(match->matchsize), m->match_size);
-		goto out;
-	}
-	if (match->checkentry &&
+	m->u.match = match;
+
+	ret = xt_check_match(match, NFPROTO_BRIDGE, m->match_size,
+	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO);
+	if (ret < 0) {
+		module_put(match->me);
+		return ret;
+	} else if (match->checkentry != NULL &&
 	    !match->checkentry(name, e, NULL, m->data, hookmask)) {
+		module_put(match->me);
 		BUGPRINT("match->check failed\n");
-		goto out;
+		return -EINVAL;
 	}
+
 	(*cnt)++;
 	return 0;
- out:
-	module_put(match->me);
-	return -EINVAL;
 }
 
 static inline int
 ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
    const char *name, unsigned int hookmask, unsigned int *cnt)
 {
-	struct ebt_watcher *watcher;
+	struct xt_target *watcher;
 	size_t left = ((char *)e + e->target_offset) - (char *)w;
 	int ret;
 
 	if (left < sizeof(struct ebt_entry_watcher) ||
 	   left - sizeof(struct ebt_entry_watcher) < w->watcher_size)
 		return -EINVAL;
-	watcher = find_watcher_lock(w->u.name, &ret, &ebt_mutex);
-	if (!watcher)
-		return ret;
-	w->u.watcher = watcher;
-	if (!try_module_get(watcher->me)) {
-		mutex_unlock(&ebt_mutex);
+
+	watcher = try_then_request_module(
+		  xt_find_target(NFPROTO_BRIDGE, w->u.name, 0),
+		  "ebt_%s", w->u.name);
+	if (IS_ERR(watcher))
+		return PTR_ERR(watcher);
+	if (watcher == NULL)
 		return -ENOENT;
-	}
-	mutex_unlock(&ebt_mutex);
-	if (watcher->family != NFPROTO_BRIDGE) {
-		printk(KERN_WARNING "ebtables: %s watcher: not for ebtables?\n",
-		       watcher->name);
-		goto out;
-	}
-	if (watcher->revision != 0) {
-		printk(KERN_WARNING "ebtables: %s watcher: ebtables is not "
-		       "supporting revisions at this time\n",
-		       watcher->name);
-		goto out;
-	}
-	if (XT_ALIGN(watcher->targetsize) != w->watcher_size) {
-		printk(KERN_WARNING "ebtables: %s watcher: "
-		       "invalid size %Zu != %u\n",
-		       watcher->name, XT_ALIGN(watcher->targetsize),
-		       w->watcher_size);
-		goto out;
-	}
-	if (watcher->checkentry &&
+	w->u.watcher = watcher;
+
+	ret = xt_check_target(watcher, NFPROTO_BRIDGE, w->watcher_size,
+	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO);
+	if (ret < 0) {
+		module_put(watcher->me);
+		return ret;
+	} else if (watcher->checkentry != NULL &&
 	    !watcher->checkentry(name, e, NULL, w->data, hookmask)) {
+		module_put(watcher->me);
 		BUGPRINT("watcher->check failed\n");
-		goto out;
+		return -EINVAL;
 	}
+
 	(*cnt)++;
 	return 0;
- out:
-	module_put(watcher->me);
-	return -EINVAL;
 }
 
 static int ebt_verify_pointers(struct ebt_replace *repl,
@@ -607,7 +577,7 @@ ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i)
 	if (i && (*i)-- == 0)
 		return 1;
 	if (m->u.match->destroy)
-		m->u.match->destroy(NULL, m->data);
+		m->u.match->destroy(m->u.match, m->data);
 	module_put(m->u.match->me);
 
 	return 0;
@@ -619,7 +589,7 @@ ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i)
 	if (i && (*i)-- == 0)
 		return 1;
 	if (w->u.watcher->destroy)
-		w->u.watcher->destroy(NULL, w->data);
+		w->u.watcher->destroy(w->u.watcher, w->data);
 	module_put(w->u.watcher->me);
 
 	return 0;
@@ -639,7 +609,7 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
 	EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 	if (t->u.target->destroy)
-		t->u.target->destroy(NULL, t->data);
+		t->u.target->destroy(t->u.target, t->data);
 	module_put(t->u.target->me);
 
 	return 0;
@@ -651,7 +621,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
    struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
 {
 	struct ebt_entry_target *t;
-	struct ebt_target *target;
+	struct xt_target *target;
 	unsigned int i, j, hook = 0, hookmask = 0;
 	size_t gap;
 	int ret;
@@ -704,27 +674,15 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 		goto cleanup_watchers;
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 	gap = e->next_offset - e->target_offset;
-	target = find_target_lock(t->u.name, &ret, &ebt_mutex);
-	if (!target)
-		goto cleanup_watchers;
-	if (!try_module_get(target->me)) {
-		mutex_unlock(&ebt_mutex);
-		ret = -ENOENT;
-		goto cleanup_watchers;
-	}
-	mutex_unlock(&ebt_mutex);
 
-	if (target->family != NFPROTO_BRIDGE) {
-		printk(KERN_WARNING "ebtables: %s target: not for ebtables?\n",
-		       target->name);
-		ret = -EINVAL;
+	target = try_then_request_module(
+		 xt_find_target(NFPROTO_BRIDGE, t->u.name, 0),
+		 "ebt_%s", t->u.name);
+	if (IS_ERR(target)) {
+		ret = PTR_ERR(target);
 		goto cleanup_watchers;
-	}
-	if (target->revision != 0) {
-		printk(KERN_WARNING "ebtables: %s target: ebtables is not "
-		       "supporting revisions at this time\n",
-		       target->name);
-		ret = -EINVAL;
+	} else if (target == NULL) {
+		ret = -ENOENT;
 		goto cleanup_watchers;
 	}
 
@@ -745,13 +703,12 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 		module_put(t->u.target->me);
 		ret = -EFAULT;
 		goto cleanup_watchers;
-	} else if (XT_ALIGN(target->targetsize) != t->target_size) {
-		printk(KERN_WARNING "ebtables: %s target: "
-		       "invalid size %Zu != %u\n",
-		       target->name, XT_ALIGN(target->targetsize),
-		       t->target_size);
-		module_put(t->u.target->me);
-		ret = -EINVAL;
+	}
+
+	ret = xt_check_target(target, NFPROTO_BRIDGE, t->target_size,
+	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO);
+	if (ret < 0) {
+		module_put(target->me);
 		goto cleanup_watchers;
 	} else if (t->u.target->checkentry &&
 	    !t->u.target->checkentry(name, e, NULL, t->data, hookmask)) {
@@ -1589,11 +1546,14 @@ static int __init ebtables_init(void)
 {
 	int ret;
 
-	mutex_lock(&ebt_mutex);
-	list_add(&ebt_standard_target.list, &ebt_targets);
-	mutex_unlock(&ebt_mutex);
-	if ((ret = nf_register_sockopt(&ebt_sockopts)) < 0)
+	ret = xt_register_target(&ebt_standard_target);
+	if (ret < 0)
 		return ret;
+	ret = nf_register_sockopt(&ebt_sockopts);
+	if (ret < 0) {
+		xt_unregister_target(&ebt_standard_target);
+		return ret;
+	}
 
 	printk(KERN_INFO "Ebtables v2.0 registered\n");
 	return 0;
@@ -1602,6 +1562,7 @@ static int __init ebtables_init(void)
 static void __exit ebtables_fini(void)
 {
 	nf_unregister_sockopt(&ebt_sockopts);
+	xt_unregister_target(&ebt_standard_target);
 	printk(KERN_INFO "Ebtables v2.0 unregistered\n");
 }
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index aece6c2d134..0e23f42e341 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -30,7 +30,7 @@
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("[ip,ip6,arp]_tables backend module");
+MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
 
 #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
 
@@ -325,7 +325,12 @@ int xt_check_match(const struct xt_match *match, unsigned short family,
 		   unsigned int size, const char *table, unsigned int hook_mask,
 		   unsigned short proto, int inv_proto)
 {
-	if (XT_ALIGN(match->matchsize) != size) {
+	if (XT_ALIGN(match->matchsize) != size &&
+	    match->matchsize != -1) {
+		/*
+		 * ebt_among is exempt from centralized matchsize checking
+		 * because it uses a dynamic-size data set.
+		 */
 		printk("%s_tables: %s match: invalid size %Zu != %u\n",
 		       xt_prefix[family], match->name,
 		       XT_ALIGN(match->matchsize), size);
-- 
cgit v1.2.3


From 102befab75c438bfa356c6976026326728771ebc Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:15 +0200
Subject: netfilter: x_tables: output bad hook mask in hexadecimal

It is a mask, and masks are most useful in hex.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/x_tables.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0e23f42e341..3b1fc40cc27 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -342,7 +342,7 @@ int xt_check_match(const struct xt_match *match, unsigned short family,
 		return -EINVAL;
 	}
 	if (match->hooks && (hook_mask & ~match->hooks) != 0) {
-		printk("%s_tables: %s match: bad hook_mask %u/%u\n",
+		printk("%s_tables: %s match: bad hook_mask %#x/%#x\n",
 		       xt_prefix[family], match->name, hook_mask, match->hooks);
 		return -EINVAL;
 	}
@@ -483,7 +483,7 @@ int xt_check_target(const struct xt_target *target, unsigned short family,
 		return -EINVAL;
 	}
 	if (target->hooks && (hook_mask & ~target->hooks) != 0) {
-		printk("%s_tables: %s target: bad hook_mask %u/%u\n",
+		printk("%s_tables: %s target: bad hook_mask %#x/%#x\n",
 		       xt_prefix[family], target->name, hook_mask,
 		       target->hooks);
 		return -EINVAL;
-- 
cgit v1.2.3


From f2ff525c8dae57b3cda51d76443f60f764f34202 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:15 +0200
Subject: netfilter: ebtables: use generic table checking

Ebtables ORs (1 << NF_BR_NUMHOOKS) into the hook mask to indicate that
the extension was called from a base chain. So this also needs to be
present in the extensions' ->hooks.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_arpreply.c | 4 ++--
 net/bridge/netfilter/ebt_dnat.c     | 2 ++
 net/bridge/netfilter/ebt_redirect.c | 2 ++
 net/bridge/netfilter/ebt_snat.c     | 6 ++----
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 8071b64af46..0e51c8d7e5f 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -73,8 +73,6 @@ ebt_arpreply_tg_check(const char *tablename, const void *entry,
 	    e->invflags & EBT_IPROTO)
 		return false;
 	CLEAR_BASE_CHAIN_BIT;
-	if (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING))
-		return false;
 	return true;
 }
 
@@ -82,6 +80,8 @@ static struct xt_target ebt_arpreply_tg_reg __read_mostly = {
 	.name		= "arpreply",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
+	.table		= "nat",
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING),
 	.target		= ebt_arpreply_tg,
 	.checkentry	= ebt_arpreply_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_arpreply_info)),
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index d2211c4a477..cb80101e412 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -51,6 +51,8 @@ static struct xt_target ebt_dnat_tg_reg __read_mostly = {
 	.name		= "dnat",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+			  (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_BROUTING),
 	.target		= ebt_dnat_tg,
 	.checkentry	= ebt_dnat_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 1b7684ffe40..a50ffbe0e4f 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -56,6 +56,8 @@ static struct xt_target ebt_redirect_tg_reg __read_mostly = {
 	.name		= "redirect",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+			  (1 << NF_BR_BROUTING),
 	.target		= ebt_redirect_tg,
 	.checkentry	= ebt_redirect_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_redirect_info)),
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index c90217a4f9e..8a55c7d49b5 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -56,10 +56,6 @@ ebt_snat_tg_check(const char *tablename, const void *e,
 	if (BASE_CHAIN && tmp == EBT_RETURN)
 		return false;
 	CLEAR_BASE_CHAIN_BIT;
-	if (strcmp(tablename, "nat"))
-		return false;
-	if (hookmask & ~(1 << NF_BR_POST_ROUTING))
-		return false;
 
 	if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
 		return false;
@@ -73,6 +69,8 @@ static struct xt_target ebt_snat_tg_reg __read_mostly = {
 	.name		= "snat",
 	.revision	= 0,
 	.family		= NFPROTO_BRIDGE,
+	.table		= "nat",
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_POST_ROUTING),
 	.target		= ebt_snat_tg,
 	.checkentry	= ebt_snat_tg_check,
 	.targetsize	= XT_ALIGN(sizeof(struct ebt_nat_info)),
-- 
cgit v1.2.3


From 5365f8022e04310f0276c95e82548da917d514db Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:16 +0200
Subject: netfilter: implement hotdrop for Ebtables

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index c4f7a2e8ed3..7964d3f0388 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -78,9 +78,10 @@ static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
 
 static inline int ebt_do_match (struct ebt_entry_match *m,
    const struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out)
+   const struct net_device *out, bool *hotdrop)
 {
-	return m->u.match->match(skb, in, out, m->u.match, m->data, 0, 0, NULL);
+	return m->u.match->match(skb, in, out, m->u.match,
+	       m->data, 0, 0, hotdrop);
 }
 
 static inline int ebt_dev_check(char *entry, const struct net_device *device)
@@ -156,6 +157,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 	struct ebt_entries *chaininfo;
 	char *base;
 	struct ebt_table_info *private;
+	bool hotdrop = false;
 
 	read_lock_bh(&table->lock);
 	private = table->private;
@@ -176,8 +178,13 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 		if (ebt_basic_match(point, eth_hdr(skb), in, out))
 			goto letscontinue;
 
-		if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, in, out) != 0)
+		if (EBT_MATCH_ITERATE(point, ebt_do_match, skb,
+		    in, out, &hotdrop) != 0)
 			goto letscontinue;
+		if (hotdrop) {
+			read_unlock_bh(&table->lock);
+			return NF_DROP;
+		}
 
 		/* increase counter */
 		(*(counter_base + i)).pcnt++;
-- 
cgit v1.2.3


From 66bff35b722956cc2423f55fcf1b69cefa24ef8b Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:16 +0200
Subject: netfilter: remove unused Ebtables functions

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c | 108 ----------------------------------------
 1 file changed, 108 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 7964d3f0388..b489ed262fa 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -56,9 +56,6 @@
 
 static DEFINE_MUTEX(ebt_mutex);
 static LIST_HEAD(ebt_tables);
-static LIST_HEAD(ebt_targets);
-static LIST_HEAD(ebt_matches);
-static LIST_HEAD(ebt_watchers);
 
 static struct xt_target ebt_standard_target = {
 	.name       = "standard",
@@ -322,24 +319,6 @@ find_table_lock(const char *name, int *error, struct mutex *mutex)
 	return find_inlist_lock(&ebt_tables, name, "ebtable_", error, mutex);
 }
 
-static inline struct ebt_match *
-find_match_lock(const char *name, int *error, struct mutex *mutex)
-{
-	return find_inlist_lock(&ebt_matches, name, "ebt_", error, mutex);
-}
-
-static inline struct ebt_watcher *
-find_watcher_lock(const char *name, int *error, struct mutex *mutex)
-{
-	return find_inlist_lock(&ebt_watchers, name, "ebt_", error, mutex);
-}
-
-static inline struct ebt_target *
-find_target_lock(const char *name, int *error, struct mutex *mutex)
-{
-	return find_inlist_lock(&ebt_targets, name, "ebt_", error, mutex);
-}
-
 static inline int
 ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
    const char *name, unsigned int hookmask, unsigned int *cnt)
@@ -1103,87 +1082,6 @@ free_newinfo:
 	return ret;
 }
 
-int ebt_register_target(struct ebt_target *target)
-{
-	struct ebt_target *t;
-	int ret;
-
-	ret = mutex_lock_interruptible(&ebt_mutex);
-	if (ret != 0)
-		return ret;
-	list_for_each_entry(t, &ebt_targets, list) {
-		if (strcmp(t->name, target->name) == 0) {
-			mutex_unlock(&ebt_mutex);
-			return -EEXIST;
-		}
-	}
-	list_add(&target->list, &ebt_targets);
-	mutex_unlock(&ebt_mutex);
-
-	return 0;
-}
-
-void ebt_unregister_target(struct ebt_target *target)
-{
-	mutex_lock(&ebt_mutex);
-	list_del(&target->list);
-	mutex_unlock(&ebt_mutex);
-}
-
-int ebt_register_match(struct ebt_match *match)
-{
-	struct ebt_match *m;
-	int ret;
-
-	ret = mutex_lock_interruptible(&ebt_mutex);
-	if (ret != 0)
-		return ret;
-	list_for_each_entry(m, &ebt_matches, list) {
-		if (strcmp(m->name, match->name) == 0) {
-			mutex_unlock(&ebt_mutex);
-			return -EEXIST;
-		}
-	}
-	list_add(&match->list, &ebt_matches);
-	mutex_unlock(&ebt_mutex);
-
-	return 0;
-}
-
-void ebt_unregister_match(struct ebt_match *match)
-{
-	mutex_lock(&ebt_mutex);
-	list_del(&match->list);
-	mutex_unlock(&ebt_mutex);
-}
-
-int ebt_register_watcher(struct ebt_watcher *watcher)
-{
-	struct ebt_watcher *w;
-	int ret;
-
-	ret = mutex_lock_interruptible(&ebt_mutex);
-	if (ret != 0)
-		return ret;
-	list_for_each_entry(w, &ebt_watchers, list) {
-		if (strcmp(w->name, watcher->name) == 0) {
-			mutex_unlock(&ebt_mutex);
-			return -EEXIST;
-		}
-	}
-	list_add(&watcher->list, &ebt_watchers);
-	mutex_unlock(&ebt_mutex);
-
-	return 0;
-}
-
-void ebt_unregister_watcher(struct ebt_watcher *watcher)
-{
-	mutex_lock(&ebt_mutex);
-	list_del(&watcher->list);
-	mutex_unlock(&ebt_mutex);
-}
-
 int ebt_register_table(struct ebt_table *table)
 {
 	struct ebt_table_info *newinfo;
@@ -1575,12 +1473,6 @@ static void __exit ebtables_fini(void)
 
 EXPORT_SYMBOL(ebt_register_table);
 EXPORT_SYMBOL(ebt_unregister_table);
-EXPORT_SYMBOL(ebt_register_match);
-EXPORT_SYMBOL(ebt_unregister_match);
-EXPORT_SYMBOL(ebt_register_watcher);
-EXPORT_SYMBOL(ebt_unregister_watcher);
-EXPORT_SYMBOL(ebt_register_target);
-EXPORT_SYMBOL(ebt_unregister_target);
 EXPORT_SYMBOL(ebt_do_table);
 module_init(ebtables_init);
 module_exit(ebtables_fini);
-- 
cgit v1.2.3


From f7277f8d3aa4d3f99a9bdb48b27a2344a637a4b2 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:16 +0200
Subject: netfilter: remove redundant casts from Ebtables

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_arpreply.c | 2 +-
 net/bridge/netfilter/ebt_ip6.c      | 4 ++--
 net/bridge/netfilter/ebt_limit.c    | 2 +-
 net/bridge/netfilter/ebt_nflog.c    | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 0e51c8d7e5f..baf5510d044 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -20,7 +20,7 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct net_device *in,
 		const struct net_device *out, unsigned int hook_nr,
 		const struct xt_target *target, const void *data)
 {
-	struct ebt_arpreply_info *info = (void *)data;
+	const struct ebt_arpreply_info *info = data;
 	const __be32 *siptr, *diptr;
 	__be32 _sip, _dip;
 	const struct arphdr *ap;
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 317e624ae59..7bd98312967 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -32,7 +32,7 @@ ebt_ip6_mt(const struct sk_buff *skb, const struct net_device *in,
 	   const struct net_device *out, const struct xt_match *match,
 	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
-	const struct ebt_ip6_info *info = (struct ebt_ip6_info *)data;
+	const struct ebt_ip6_info *info = data;
 	const struct ipv6hdr *ih6;
 	struct ipv6hdr _ip6h;
 	const struct tcpudphdr *pptr;
@@ -98,7 +98,7 @@ ebt_ip6_mt_check(const char *table, const void *entry,
 		 unsigned int hook_mask)
 {
 	const struct ebt_entry *e = entry;
-	struct ebt_ip6_info *info = (struct ebt_ip6_info *)data;
+	struct ebt_ip6_info *info = data;
 
 	if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
 		return false;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 43d9a500363..58aaaa14906 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -35,7 +35,7 @@ ebt_limit_mt(const struct sk_buff *skb, const struct net_device *in,
 	     const struct net_device *out, const struct xt_match *match,
 	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
 {
-	struct ebt_limit_info *info = (struct ebt_limit_info *)data;
+	struct ebt_limit_info *info = (void *)data;
 	unsigned long now = jiffies;
 
 	spin_lock_bh(&limit_lock);
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 917ac360079..74b4fa0aabc 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -24,7 +24,7 @@ ebt_nflog_tg(struct sk_buff *skb, const struct net_device *in,
 	     const struct net_device *out, unsigned int hooknr,
 	     const struct xt_target *target, const void *data)
 {
-	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
+	const struct ebt_nflog_info *info = data;
 	struct nf_loginfo li;
 
 	li.type = NF_LOG_TYPE_ULOG;
@@ -41,7 +41,7 @@ ebt_nflog_tg_check(const char *table, const void *e,
 		   const struct xt_target *target, void *data,
 		   unsigned int hookmask)
 {
-	struct ebt_nflog_info *info = (struct ebt_nflog_info *)data;
+	struct ebt_nflog_info *info = data;
 
 	if (info->flags & ~EBT_NFLOG_MASK)
 		return false;
-- 
cgit v1.2.3


From 147c3844ad381b58715a6ee2ea697594e3c06284 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:16 +0200
Subject: netfilter: ebtables: fix one wrong return value

Usually -EINVAL is used when checkentry fails (see *_tables).

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index b489ed262fa..7d8ead52d25 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -699,7 +699,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	} else if (t->u.target->checkentry &&
 	    !t->u.target->checkentry(name, e, NULL, t->data, hookmask)) {
 		module_put(t->u.target->me);
-		ret = -EFAULT;
+		ret = -EINVAL;
 		goto cleanup_watchers;
 	}
 	(*cnt)++;
-- 
cgit v1.2.3


From 367c679007fa4f990eb7ee381326ec59d8148b0e Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:17 +0200
Subject: netfilter: xtables: do centralized checkentry call (1/2)

It used to be that {ip,ip6,etc}_tables called extension->checkentry
themselves, but this can be moved into the xtables core.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c | 24 ++++++------------------
 net/ipv4/netfilter/arp_tables.c | 10 ++++------
 net/ipv4/netfilter/ip_tables.c  | 23 +++++++++--------------
 net/ipv6/netfilter/ip6_tables.c | 23 +++++++++--------------
 net/netfilter/x_tables.c        | 12 ++++++++++--
 net/sched/act_ipt.c             | 14 +++-----------
 6 files changed, 41 insertions(+), 65 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 7d8ead52d25..7ee72b71d3c 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -340,15 +340,11 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 	m->u.match = match;
 
 	ret = xt_check_match(match, NFPROTO_BRIDGE, m->match_size,
-	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO);
+	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO,
+	      e, m->data);
 	if (ret < 0) {
 		module_put(match->me);
 		return ret;
-	} else if (match->checkentry != NULL &&
-	    !match->checkentry(name, e, NULL, m->data, hookmask)) {
-		module_put(match->me);
-		BUGPRINT("match->check failed\n");
-		return -EINVAL;
 	}
 
 	(*cnt)++;
@@ -377,15 +373,11 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 	w->u.watcher = watcher;
 
 	ret = xt_check_target(watcher, NFPROTO_BRIDGE, w->watcher_size,
-	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO);
+	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO,
+	      e, w->data);
 	if (ret < 0) {
 		module_put(watcher->me);
 		return ret;
-	} else if (watcher->checkentry != NULL &&
-	    !watcher->checkentry(name, e, NULL, w->data, hookmask)) {
-		module_put(watcher->me);
-		BUGPRINT("watcher->check failed\n");
-		return -EINVAL;
 	}
 
 	(*cnt)++;
@@ -692,15 +684,11 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	}
 
 	ret = xt_check_target(target, NFPROTO_BRIDGE, t->target_size,
-	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO);
+	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO,
+	      e, t->data);
 	if (ret < 0) {
 		module_put(target->me);
 		goto cleanup_watchers;
-	} else if (t->u.target->checkentry &&
-	    !t->u.target->checkentry(name, e, NULL, t->data, hookmask)) {
-		module_put(t->u.target->me);
-		ret = -EINVAL;
-		goto cleanup_watchers;
 	}
 	(*cnt)++;
 	return 0;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b4a9a1799c9..ae525a9afbe 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -465,15 +465,13 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 
 	ret = xt_check_target(target, NFPROTO_ARP,
 			      t->u.target_size - sizeof(*t),
-			      name, e->comefrom, 0, 0);
-	if (!ret && t->u.kernel.target->checkentry
-	    && !t->u.kernel.target->checkentry(name, e, target, t->data,
-					       e->comefrom)) {
+			      name, e->comefrom, 0, 0, e, t->data);
+	if (ret < 0) {
 		duprintf("arp_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
-		ret = -EINVAL;
+		return ret;
 	}
-	return ret;
+	return 0;
 }
 
 static inline int
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4e7c719445c..b4c74a7a807 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -616,17 +616,14 @@ check_match(struct ipt_entry_match *m, const char *name,
 	match = m->u.kernel.match;
 	ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m),
 			     name, hookmask, ip->proto,
-			     ip->invflags & IPT_INV_PROTO);
-	if (!ret && m->u.kernel.match->checkentry
-	    && !m->u.kernel.match->checkentry(name, ip, match, m->data,
-					      hookmask)) {
+			     ip->invflags & IPT_INV_PROTO, ip, m->data);
+	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 m->u.kernel.match->name);
-		ret = -EINVAL;
+		return ret;
 	}
-	if (!ret)
-		(*i)++;
-	return ret;
+	++*i;
+	return 0;
 }
 
 static int
@@ -668,15 +665,13 @@ static int check_target(struct ipt_entry *e, const char *name)
 	target = t->u.kernel.target;
 	ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
 			      name, e->comefrom, e->ip.proto,
-			      e->ip.invflags & IPT_INV_PROTO);
-	if (!ret && t->u.kernel.target->checkentry
-	    && !t->u.kernel.target->checkentry(name, e, target, t->data,
-					       e->comefrom)) {
+			      e->ip.invflags & IPT_INV_PROTO, e, t->data);
+	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
-		ret = -EINVAL;
+		return ret;
 	}
-	return ret;
+	return 0;
 }
 
 static int
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 0b4557e0343..12c41b8d365 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -642,17 +642,14 @@ static int check_match(struct ip6t_entry_match *m, const char *name,
 	match = m->u.kernel.match;
 	ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m),
 			     name, hookmask, ipv6->proto,
-			     ipv6->invflags & IP6T_INV_PROTO);
-	if (!ret && m->u.kernel.match->checkentry
-	    && !m->u.kernel.match->checkentry(name, ipv6, match, m->data,
-					      hookmask)) {
+			     ipv6->invflags & IP6T_INV_PROTO, ipv6, m->data);
+	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 m->u.kernel.match->name);
-		ret = -EINVAL;
+		return ret;
 	}
-	if (!ret)
-		(*i)++;
-	return ret;
+	++*i;
+	return 0;
 }
 
 static int
@@ -694,15 +691,13 @@ static int check_target(struct ip6t_entry *e, const char *name)
 	target = t->u.kernel.target;
 	ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t),
 			      name, e->comefrom, e->ipv6.proto,
-			      e->ipv6.invflags & IP6T_INV_PROTO);
-	if (!ret && t->u.kernel.target->checkentry
-	    && !t->u.kernel.target->checkentry(name, e, target, t->data,
-					       e->comefrom)) {
+			      e->ipv6.invflags & IP6T_INV_PROTO, e, t->data);
+	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
-		ret = -EINVAL;
+		return ret;
 	}
-	return ret;
+	return 0;
 }
 
 static int
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 3b1fc40cc27..d1f2fb3e8f2 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -323,7 +323,8 @@ EXPORT_SYMBOL_GPL(xt_find_revision);
 
 int xt_check_match(const struct xt_match *match, unsigned short family,
 		   unsigned int size, const char *table, unsigned int hook_mask,
-		   unsigned short proto, int inv_proto)
+		   unsigned short proto, int inv_proto, const void *entry,
+		   void *matchinfo)
 {
 	if (XT_ALIGN(match->matchsize) != size &&
 	    match->matchsize != -1) {
@@ -351,6 +352,9 @@ int xt_check_match(const struct xt_match *match, unsigned short family,
 		       xt_prefix[family], match->name, match->proto);
 		return -EINVAL;
 	}
+	if (match->checkentry != NULL &&
+	    !match->checkentry(table, entry, match, matchinfo, hook_mask))
+		return -EINVAL;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_check_match);
@@ -469,7 +473,8 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 
 int xt_check_target(const struct xt_target *target, unsigned short family,
 		    unsigned int size, const char *table, unsigned int hook_mask,
-		    unsigned short proto, int inv_proto)
+		    unsigned short proto, int inv_proto, const void *entry,
+		    void *targinfo)
 {
 	if (XT_ALIGN(target->targetsize) != size) {
 		printk("%s_tables: %s target: invalid size %Zu != %u\n",
@@ -493,6 +498,9 @@ int xt_check_target(const struct xt_target *target, unsigned short family,
 		       xt_prefix[family], target->name, target->proto);
 		return -EINVAL;
 	}
+	if (target->checkentry != NULL &&
+	    !target->checkentry(table, entry, target, targinfo, hook_mask))
+		return -EINVAL;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_check_target);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d1263b3c96c..79ea19375ca 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -51,20 +51,12 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 	t->u.kernel.target = target;
 
 	ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
-			      table, hook, 0, 0);
-	if (ret) {
+			      table, hook, 0, 0, NULL, t->data);
+	if (ret < 0) {
 		module_put(t->u.kernel.target->me);
 		return ret;
 	}
-	if (t->u.kernel.target->checkentry
-	    && !t->u.kernel.target->checkentry(table, NULL,
-					       t->u.kernel.target, t->data,
-					       hook)) {
-		module_put(t->u.kernel.target->me);
-		ret = -EINVAL;
-	}
-
-	return ret;
+	return 0;
 }
 
 static void ipt_destroy_target(struct ipt_entry_target *t)
-- 
cgit v1.2.3


From 77d7358995489bf354fb4f65f4528e47980ffb08 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:17 +0200
Subject: netfilter: ip6tables: fix name of hopbyhop in Kconfig

The module is called hbh, not hopbyhop.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 0cfcce7b18d..f24432462be 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -67,7 +67,7 @@ config IP6_NF_MATCH_RT
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config IP6_NF_MATCH_OPTS
-	tristate '"hopbyhop" and "dst" opts header match support'
+	tristate '"hbh" hop-by-hop and "dst" opts header match support'
 	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-- 
cgit v1.2.3


From 2203eb47603b01b15a4b1d5b1c7886da96158e74 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:17 +0200
Subject: netfilter: ip6tables: fix Kconfig entry dependency for ip6t_LOG

ip6t_LOG does certainly not depend on the filter table.
(Also, move it so that menuconfig still displays it correctly.)

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/Kconfig | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index f24432462be..fee881bd31f 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -136,24 +136,24 @@ config IP6_NF_MATCH_EUI64
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The targets
-config IP6_NF_FILTER
-	tristate "Packet filtering"
+config IP6_NF_TARGET_LOG
+	tristate "LOG target support"
 	depends on IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
-	  Packet filtering defines a table `filter', which has a series of
-	  rules for simple packet filtering at local input, forwarding and
-	  local output.  See the man page for iptables(8).
+	  This option adds a `LOG' target, which allows you to create rules in
+	  any iptables table which records the packet header to the syslog.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_TARGET_LOG
-	tristate "LOG target support"
-	depends on IP6_NF_FILTER
+config IP6_NF_FILTER
+	tristate "Packet filtering"
+	depends on IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
-	  This option adds a `LOG' target, which allows you to create rules in
-	  any iptables table which records the packet header to the syslog.
+	  Packet filtering defines a table `filter', which has a series of
+	  rules for simple packet filtering at local input, forwarding and
+	  local output.  See the man page for iptables(8).
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-- 
cgit v1.2.3


From 20f3c56f4d7c76bcb66050f3364aa8da110f5bbd Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:17 +0200
Subject: netfilter: ebtables: make BRIDGE_NF_EBTABLES a menuconfig option

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/Kconfig | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index e7c197ffb2f..366d3e9d51f 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -2,22 +2,21 @@
 # Bridge netfilter configuration
 #
 
-menu "Bridge: Netfilter Configuration"
-	depends on BRIDGE && BRIDGE_NETFILTER
-
-config BRIDGE_NF_EBTABLES
+menuconfig BRIDGE_NF_EBTABLES
 	tristate "Ethernet Bridge tables (ebtables) support"
 	select NETFILTER_XTABLES
 	help
 	  ebtables is a general, extensible frame/packet identification
 	  framework. Say 'Y' or 'M' here if you want to do Ethernet
 	  filtering/NAT/brouting on the Ethernet bridge.
+
+if BRIDGE_NF_EBTABLES
+
 #
 # tables
 #
 config BRIDGE_EBT_BROUTE
 	tristate "ebt: broute table support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  The ebtables broute table is used to define rules that decide between
 	  bridging and routing frames, giving Linux the functionality of a
@@ -28,7 +27,6 @@ config BRIDGE_EBT_BROUTE
 
 config BRIDGE_EBT_T_FILTER
 	tristate "ebt: filter table support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  The ebtables filter table is used to define frame filtering rules at
 	  local input, forwarding and local output. See the man page for
@@ -38,7 +36,6 @@ config BRIDGE_EBT_T_FILTER
 
 config BRIDGE_EBT_T_NAT
 	tristate "ebt: nat table support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  The ebtables nat table is used to define rules that alter the MAC
 	  source address (MAC SNAT) or the MAC destination address (MAC DNAT).
@@ -50,7 +47,6 @@ config BRIDGE_EBT_T_NAT
 #
 config BRIDGE_EBT_802_3
 	tristate "ebt: 802.3 filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds matching support for 802.3 Ethernet frames.
 
@@ -58,7 +54,6 @@ config BRIDGE_EBT_802_3
 
 config BRIDGE_EBT_AMONG
 	tristate "ebt: among filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the among match, which allows matching the MAC source
 	  and/or destination address on a list of addresses. Optionally,
@@ -68,7 +63,6 @@ config BRIDGE_EBT_AMONG
 
 config BRIDGE_EBT_ARP
 	tristate "ebt: ARP filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the ARP match, which allows ARP and RARP header field
 	  filtering.
@@ -77,7 +71,6 @@ config BRIDGE_EBT_ARP
 
 config BRIDGE_EBT_IP
 	tristate "ebt: IP filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the IP match, which allows basic IP header field
 	  filtering.
@@ -95,7 +88,6 @@ config BRIDGE_EBT_IP6
 
 config BRIDGE_EBT_LIMIT
 	tristate "ebt: limit match support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the limit match, which allows you to control
 	  the rate at which a rule can be matched. This match is the
@@ -106,7 +98,6 @@ config BRIDGE_EBT_LIMIT
 
 config BRIDGE_EBT_MARK
 	tristate "ebt: mark filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the mark match, which allows matching frames based on
 	  the 'nfmark' value in the frame. This can be set by the mark target.
@@ -117,7 +108,6 @@ config BRIDGE_EBT_MARK
 
 config BRIDGE_EBT_PKTTYPE
 	tristate "ebt: packet type filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the packet type match, which allows matching on the
 	  type of packet based on its Ethernet "class" (as determined by
@@ -128,7 +118,6 @@ config BRIDGE_EBT_PKTTYPE
 
 config BRIDGE_EBT_STP
 	tristate "ebt: STP filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the Spanning Tree Protocol match, which
 	  allows STP header field filtering.
@@ -137,7 +126,6 @@ config BRIDGE_EBT_STP
 
 config BRIDGE_EBT_VLAN
 	tristate "ebt: 802.1Q VLAN filter support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the 802.1Q vlan match, which allows the filtering of
 	  802.1Q vlan fields.
@@ -157,7 +145,6 @@ config BRIDGE_EBT_ARPREPLY
 
 config BRIDGE_EBT_DNAT
 	tristate "ebt: dnat target support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the MAC DNAT target, which allows altering the MAC
 	  destination address of frames.
@@ -166,7 +153,6 @@ config BRIDGE_EBT_DNAT
 
 config BRIDGE_EBT_MARK_T
 	tristate "ebt: mark target support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the mark target, which allows marking frames by
 	  setting the 'nfmark' value in the frame.
@@ -177,7 +163,6 @@ config BRIDGE_EBT_MARK_T
 
 config BRIDGE_EBT_REDIRECT
 	tristate "ebt: redirect target support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the MAC redirect target, which allows altering the MAC
 	  destination address of a frame to that of the device it arrived on.
@@ -186,7 +171,6 @@ config BRIDGE_EBT_REDIRECT
 
 config BRIDGE_EBT_SNAT
 	tristate "ebt: snat target support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the MAC SNAT target, which allows altering the MAC
 	  source address of frames.
@@ -197,7 +181,6 @@ config BRIDGE_EBT_SNAT
 #
 config BRIDGE_EBT_LOG
 	tristate "ebt: log support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option adds the log watcher, that you can use in any rule
 	  in any ebtables table. It records info about the frame header
@@ -207,7 +190,6 @@ config BRIDGE_EBT_LOG
 
 config BRIDGE_EBT_ULOG
 	tristate "ebt: ulog support (OBSOLETE)"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option enables the old bridge-specific "ebt_ulog" implementation
 	  which has been obsoleted by the new "nfnetlink_log" code (see
@@ -224,7 +206,6 @@ config BRIDGE_EBT_ULOG
 
 config BRIDGE_EBT_NFLOG
 	tristate "ebt: nflog support"
-	depends on BRIDGE_NF_EBTABLES
 	help
 	  This option enables the nflog watcher, which allows to LOG
 	  messages through the netfilter logging API, which can use
@@ -236,4 +217,4 @@ config BRIDGE_EBT_NFLOG
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-endmenu
+endif # BRIDGE_NF_EBTABLES
-- 
cgit v1.2.3


From aba0d34800d7f56493b4d5548cc06498a4d69124 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:17 +0200
Subject: netfilter: xtables: sort extensions alphabetically in Kconfig

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig |  78 +++++++++++++++++------------------
 net/ipv6/netfilter/Kconfig |  44 ++++++++++----------
 net/netfilter/Kconfig      | 100 ++++++++++++++++++++++-----------------------
 3 files changed, 111 insertions(+), 111 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 07757ac8d5d..087b8290684 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -62,15 +62,16 @@ config IP_NF_IPTABLES
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The matches.
-config IP_NF_MATCH_ECN
-	tristate '"ecn" match support'
+config IP_NF_MATCH_ADDRTYPE
+	tristate '"addrtype" address type match support'
 	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-	  This option adds a `ECN' match, which allows you to match against
-	  the IPv4 and TCP header ECN fields.
+	  This option allows you to match what routing thinks of an address,
+	  eg. UNICAST, LOCAL, BROADCAST, ...
 
-	  To compile it as a module, choose M here.  If unsure, say N.
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
 config IP_NF_MATCH_AH
 	tristate '"ah" match support'
@@ -82,26 +83,25 @@ config IP_NF_MATCH_AH
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_TTL
-	tristate '"ttl" match support'
+config IP_NF_MATCH_ECN
+	tristate '"ecn" match support'
 	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-	  This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
-	  to match packets by their TTL value.
+	  This option adds a `ECN' match, which allows you to match against
+	  the IPv4 and TCP header ECN fields.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_ADDRTYPE
-	tristate '"addrtype" address type match support'
+config IP_NF_MATCH_TTL
+	tristate '"ttl" match support'
 	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-	  This option allows you to match what routing thinks of an address,
-	  eg. UNICAST, LOCAL, BROADCAST, ...
+	  This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
+	  to match packets by their TTL value.
 
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+	  To compile it as a module, choose M here.  If unsure, say N.
 
 # `filter', generic and specific targets
 config IP_NF_FILTER
@@ -186,26 +186,26 @@ config IP_NF_TARGET_MASQUERADE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_REDIRECT
-	tristate "REDIRECT target support"
+config IP_NF_TARGET_NETMAP
+	tristate "NETMAP target support"
 	depends on NF_NAT
 	depends on NETFILTER_ADVANCED
 	help
-	  REDIRECT is a special case of NAT: all incoming connections are
-	  mapped onto the incoming interface's address, causing the packets to
-	  come to the local machine instead of passing through.  This is
-	  useful for transparent proxies.
+	  NETMAP is an implementation of static 1:1 NAT mapping of network
+	  addresses. It maps the network address part, while keeping the host
+	  address part intact.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_NETMAP
-	tristate "NETMAP target support"
+config IP_NF_TARGET_REDIRECT
+	tristate "REDIRECT target support"
 	depends on NF_NAT
 	depends on NETFILTER_ADVANCED
 	help
-	  NETMAP is an implementation of static 1:1 NAT mapping of network
-	  addresses. It maps the network address part, while keeping the host
-	  address part intact.
+	  REDIRECT is a special case of NAT: all incoming connections are
+	  mapped onto the incoming interface's address, causing the packets to
+	  come to the local machine instead of passing through.  This is
+	  useful for transparent proxies.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
@@ -300,6 +300,19 @@ config IP_NF_MANGLE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_CLUSTERIP
+	tristate "CLUSTERIP target support (EXPERIMENTAL)"
+	depends on IP_NF_MANGLE && EXPERIMENTAL
+	depends on NF_CONNTRACK_IPV4
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_MARK
+	help
+	  The CLUSTERIP target allows you to build load-balancing clusters of
+	  network servers without having a dedicated load-balancing
+	  router/server/switch.
+	
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_NF_TARGET_ECN
 	tristate "ECN target support"
 	depends on IP_NF_MANGLE
@@ -330,19 +343,6 @@ config IP_NF_TARGET_TTL
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_CLUSTERIP
-	tristate "CLUSTERIP target support (EXPERIMENTAL)"
-	depends on IP_NF_MANGLE && EXPERIMENTAL
-	depends on NF_CONNTRACK_IPV4
-	depends on NETFILTER_ADVANCED
-	select NF_CONNTRACK_MARK
-	help
-	  The CLUSTERIP target allows you to build load-balancing clusters of
-	  network servers without having a dedicated load-balancing
-	  router/server/switch.
-	
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 # raw + specific targets
 config IP_NF_RAW
 	tristate  'raw table support (required for NOTRACK/TRACE)'
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index fee881bd31f..91ffba08c29 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -56,23 +56,23 @@ config IP6_NF_IPTABLES
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The simple matches.
-config IP6_NF_MATCH_RT
-	tristate '"rt" Routing header match support'
+config IP6_NF_MATCH_AH
+	tristate '"ah" match support'
 	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-	  rt matching allows you to match packets based on the routing
-	  header of the packet.
+	  This module allows one to match AH packets.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_OPTS
-	tristate '"hbh" hop-by-hop and "dst" opts header match support'
+config IP6_NF_MATCH_EUI64
+	tristate '"eui64" address check'
 	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-	  This allows one to match packets based on the hop-by-hop
-	  and destination options headers of a packet.
+	  This module performs checking on the IPv6 source address
+	  Compares the last 64 bits with the EUI64 (delivered
+	  from the MAC address) address
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
@@ -86,6 +86,16 @@ config IP6_NF_MATCH_FRAG
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_MATCH_OPTS
+	tristate '"hbh" hop-by-hop and "dst" opts header match support'
+	depends on IP6_NF_IPTABLES
+	depends on NETFILTER_ADVANCED
+	help
+	  This allows one to match packets based on the hop-by-hop
+	  and destination options headers of a packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP6_NF_MATCH_HL
 	tristate '"hl" match support'
 	depends on IP6_NF_IPTABLES
@@ -106,15 +116,6 @@ config IP6_NF_MATCH_IPV6HEADER
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_AH
-	tristate '"ah" match support'
-	depends on IP6_NF_IPTABLES
-	depends on NETFILTER_ADVANCED
-	help
-	  This module allows one to match AH packets.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_MATCH_MH
 	tristate '"mh" match support'
 	depends on IP6_NF_IPTABLES
@@ -124,14 +125,13 @@ config IP6_NF_MATCH_MH
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_EUI64
-	tristate '"eui64" address check'
+config IP6_NF_MATCH_RT
+	tristate '"rt" Routing header match support'
 	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
-	  This module performs checking on the IPv6 source address
-	  Compares the last 64 bits with the EUI64 (delivered
-	  from the MAC address) address
+	  rt matching allows you to match packets based on the routing
+	  header of the packet.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index de18bba619f..9ad74e8bc5b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -340,6 +340,18 @@ config NETFILTER_XT_TARGET_CONNMARK
 	  <file:Documentation/kbuild/modules.txt>.  The module will be called
 	  ipt_CONNMARK.ko.  If unsure, say `N'.
 
+config NETFILTER_XT_TARGET_CONNSECMARK
+	tristate '"CONNSECMARK" target support'
+	depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The CONNSECMARK target copies security markings from packets
+	  to connections, and restores security markings from connections
+	  to packets (if the packets are not already marked).  This would
+	  normally be used in conjunction with the SECMARK target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_DSCP
 	tristate '"DSCP" and "TOS" target support'
 	depends on NETFILTER_XTABLES
@@ -371,18 +383,6 @@ config NETFILTER_XT_TARGET_MARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config NETFILTER_XT_TARGET_NFQUEUE
-	tristate '"NFQUEUE" target Support'
-	depends on NETFILTER_XTABLES
-	depends on NETFILTER_ADVANCED
-	help
-	  This target replaced the old obsolete QUEUE target.
-
-	  As opposed to QUEUE, it supports 65535 different queues,
-	  not just one.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config NETFILTER_XT_TARGET_NFLOG
 	tristate '"NFLOG" target support'
 	depends on NETFILTER_XTABLES
@@ -395,6 +395,18 @@ config NETFILTER_XT_TARGET_NFLOG
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_NFQUEUE
+	tristate '"NFQUEUE" target Support'
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	help
+	  This target replaced the old obsolete QUEUE target.
+
+	  As opposed to QUEUE, it supports 65535 different queues,
+	  not just one.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_NOTRACK
 	tristate  '"NOTRACK" target support'
 	depends on NETFILTER_XTABLES
@@ -459,18 +471,6 @@ config NETFILTER_XT_TARGET_SECMARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config NETFILTER_XT_TARGET_CONNSECMARK
-	tristate '"CONNSECMARK" target support'
-	depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK
-	default m if NETFILTER_ADVANCED=n
-	help
-	  The CONNSECMARK target copies security markings from packets
-	  to connections, and restores security markings from connections
-	  to packets (if the packets are not already marked).  This would
-	  normally be used in conjunction with the SECMARK target.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config NETFILTER_XT_TARGET_TCPMSS
 	tristate '"TCPMSS" target support'
 	depends on NETFILTER_XTABLES && (IPV6 || IPV6=n)
@@ -607,6 +607,21 @@ config NETFILTER_XT_MATCH_ESP
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_HASHLIMIT
+	tristate '"hashlimit" match support'
+	depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `hashlimit' match.
+
+	  As opposed to `limit', this match dynamically creates a hash table
+	  of limit buckets, based on your selection of source/destination
+	  addresses and/or ports.
+
+	  It enables you to express policies like `10kpps for any given
+	  destination address' or `500pps from any given source address'
+	  with a single rule.
+
 config NETFILTER_XT_MATCH_HELPER
 	tristate '"helper" match support'
 	depends on NETFILTER_XTABLES
@@ -671,6 +686,17 @@ config NETFILTER_XT_MATCH_MARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_MULTIPORT
+	tristate '"multiport" Multiple port match support'
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	help
+	  Multiport matching allows you to match TCP or UDP packets based on
+	  a series of source or destination ports: normally a rule can only
+	  match a single range of ports.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_OWNER
 	tristate '"owner" match support'
 	depends on NETFILTER_XTABLES
@@ -691,17 +717,6 @@ config NETFILTER_XT_MATCH_POLICY
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config NETFILTER_XT_MATCH_MULTIPORT
-	tristate '"multiport" Multiple port match support'
-	depends on NETFILTER_XTABLES
-	depends on NETFILTER_ADVANCED
-	help
-	  Multiport matching allows you to match TCP or UDP packets based on
-	  a series of source or destination ports: normally a rule can only
-	  match a single range of ports.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config NETFILTER_XT_MATCH_PHYSDEV
 	tristate '"physdev" match support'
 	depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER
@@ -884,20 +899,5 @@ config NETFILTER_XT_MATCH_U32
 
 	  Details and examples are in the kernel module source.
 
-config NETFILTER_XT_MATCH_HASHLIMIT
-	tristate '"hashlimit" match support'
-	depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
-	depends on NETFILTER_ADVANCED
-	help
-	  This option adds a `hashlimit' match.
-
-	  As opposed to `limit', this match dynamically creates a hash table
-	  of limit buckets, based on your selection of source/destination
-	  addresses and/or ports.
-
-	  It enables you to express policies like `10kpps for any given
-	  destination address' or `500pps from any given source address'
-	  with a single rule.
-
 endmenu
 
-- 
cgit v1.2.3


From c2df73de246ae75705af8ceed4f385b261dea108 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:18 +0200
Subject: netfilter: xtables: use "if" blocks in Kconfig

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig | 36 +++++++++-----------
 net/ipv6/netfilter/Kconfig | 17 +++-------
 net/netfilter/Kconfig      | 84 ++++++++++++----------------------------------
 3 files changed, 41 insertions(+), 96 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 087b8290684..3816e1dc929 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -61,10 +61,11 @@ config IP_NF_IPTABLES
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+if IP_NF_IPTABLES
+
 # The matches.
 config IP_NF_MATCH_ADDRTYPE
 	tristate '"addrtype" address type match support'
-	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option allows you to match what routing thinks of an address,
@@ -75,7 +76,6 @@ config IP_NF_MATCH_ADDRTYPE
 
 config IP_NF_MATCH_AH
 	tristate '"ah" match support'
-	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This match extension allows you to match a range of SPIs
@@ -85,7 +85,6 @@ config IP_NF_MATCH_AH
 
 config IP_NF_MATCH_ECN
 	tristate '"ecn" match support'
-	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `ECN' match, which allows you to match against
@@ -95,7 +94,6 @@ config IP_NF_MATCH_ECN
 
 config IP_NF_MATCH_TTL
 	tristate '"ttl" match support'
-	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
@@ -106,7 +104,6 @@ config IP_NF_MATCH_TTL
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
-	depends on IP_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  Packet filtering defines a table `filter', which has a series of
@@ -128,7 +125,6 @@ config IP_NF_TARGET_REJECT
 
 config IP_NF_TARGET_LOG
 	tristate "LOG target support"
-	depends on IP_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `LOG' target, which allows you to create rules in
@@ -138,7 +134,6 @@ config IP_NF_TARGET_LOG
 
 config IP_NF_TARGET_ULOG
 	tristate "ULOG target support"
-	depends on IP_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	---help---
 
@@ -159,7 +154,7 @@ config IP_NF_TARGET_ULOG
 # NAT + specific targets: nf_conntrack
 config NF_NAT
 	tristate "Full NAT"
-	depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4
+	depends on NF_CONNTRACK_IPV4
 	default m if NETFILTER_ADVANCED=n
 	help
 	  The Full NAT option allows masquerading, port forwarding and other
@@ -254,44 +249,43 @@ config NF_NAT_PROTO_SCTP
 
 config NF_NAT_FTP
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_FTP
 
 config NF_NAT_IRC
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_IRC
 
 config NF_NAT_TFTP
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_TFTP
 
 config NF_NAT_AMANDA
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_AMANDA
 
 config NF_NAT_PPTP
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_PPTP
 	select NF_NAT_PROTO_GRE
 
 config NF_NAT_H323
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_H323
 
 config NF_NAT_SIP
 	tristate
-	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
+	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_SIP
 
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
-	depends on IP_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `mangle' table to iptables: see the man page for
@@ -346,7 +340,6 @@ config IP_NF_TARGET_TTL
 # raw + specific targets
 config IP_NF_RAW
 	tristate  'raw table support (required for NOTRACK/TRACE)'
-	depends on IP_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `raw' table to iptables. This table is the very
@@ -359,7 +352,6 @@ config IP_NF_RAW
 # security table for MAC policy
 config IP_NF_SECURITY
 	tristate "Security table"
-	depends on IP_NF_IPTABLES
 	depends on SECURITY
 	depends on NETFILTER_ADVANCED
 	help
@@ -368,6 +360,8 @@ config IP_NF_SECURITY
 	 
 	  If unsure, say N.
 
+endif # IP_NF_IPTABLES
+
 # ARP tables
 config IP_NF_ARPTABLES
 	tristate "ARP tables support"
@@ -380,9 +374,10 @@ config IP_NF_ARPTABLES
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+if IP_NF_ARPTABLES
+
 config IP_NF_ARPFILTER
 	tristate "ARP packet filtering"
-	depends on IP_NF_ARPTABLES
 	help
 	  ARP packet filtering defines a table `filter', which has a series of
 	  rules for simple ARP packet filtering at local input and
@@ -393,10 +388,11 @@ config IP_NF_ARPFILTER
 
 config IP_NF_ARP_MANGLE
 	tristate "ARP payload mangling"
-	depends on IP_NF_ARPTABLES
 	help
 	  Allows altering the ARP packet payload: source and destination
 	  hardware and network addresses.
 
+endif # IP_NF_ARPTABLES
+
 endmenu
 
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 91ffba08c29..53ea512c460 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -55,10 +55,11 @@ config IP6_NF_IPTABLES
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+if IP6_NF_IPTABLES
+
 # The simple matches.
 config IP6_NF_MATCH_AH
 	tristate '"ah" match support'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This module allows one to match AH packets.
@@ -67,7 +68,6 @@ config IP6_NF_MATCH_AH
 
 config IP6_NF_MATCH_EUI64
 	tristate '"eui64" address check'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This module performs checking on the IPv6 source address
@@ -78,7 +78,6 @@ config IP6_NF_MATCH_EUI64
 
 config IP6_NF_MATCH_FRAG
 	tristate '"frag" Fragmentation header match support'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  frag matching allows you to match packets based on the fragmentation
@@ -88,7 +87,6 @@ config IP6_NF_MATCH_FRAG
 
 config IP6_NF_MATCH_OPTS
 	tristate '"hbh" hop-by-hop and "dst" opts header match support'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This allows one to match packets based on the hop-by-hop
@@ -98,7 +96,6 @@ config IP6_NF_MATCH_OPTS
 
 config IP6_NF_MATCH_HL
 	tristate '"hl" match support'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  HL matching allows you to match packets based on the hop
@@ -108,7 +105,6 @@ config IP6_NF_MATCH_HL
 
 config IP6_NF_MATCH_IPV6HEADER
 	tristate '"ipv6header" IPv6 Extension Headers Match'
-	depends on IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This module allows one to match packets based upon
@@ -118,7 +114,6 @@ config IP6_NF_MATCH_IPV6HEADER
 
 config IP6_NF_MATCH_MH
 	tristate '"mh" match support'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This module allows one to match MH packets.
@@ -127,7 +122,6 @@ config IP6_NF_MATCH_MH
 
 config IP6_NF_MATCH_RT
 	tristate '"rt" Routing header match support'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  rt matching allows you to match packets based on the routing
@@ -138,7 +132,6 @@ config IP6_NF_MATCH_RT
 # The targets
 config IP6_NF_TARGET_LOG
 	tristate "LOG target support"
-	depends on IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `LOG' target, which allows you to create rules in
@@ -148,7 +141,6 @@ config IP6_NF_TARGET_LOG
 
 config IP6_NF_FILTER
 	tristate "Packet filtering"
-	depends on IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  Packet filtering defines a table `filter', which has a series of
@@ -170,7 +162,6 @@ config IP6_NF_TARGET_REJECT
 
 config IP6_NF_MANGLE
 	tristate "Packet mangling"
-	depends on IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `mangle' table to iptables: see the man page for
@@ -198,7 +189,6 @@ config IP6_NF_TARGET_HL
 
 config IP6_NF_RAW
 	tristate  'raw table support (required for TRACE)'
-	depends on IP6_NF_IPTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `raw' table to ip6tables. This table is the very
@@ -211,7 +201,6 @@ config IP6_NF_RAW
 # security table for MAC policy
 config IP6_NF_SECURITY
        tristate "Security table"
-       depends on IP6_NF_IPTABLES
        depends on SECURITY
        depends on NETFILTER_ADVANCED
        help
@@ -220,5 +209,7 @@ config IP6_NF_SECURITY
         
          If unsure, say N.
 
+endif # IP6_NF_IPTABLES
+
 endmenu
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9ad74e8bc5b..899e78051d8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -38,10 +38,11 @@ config NF_CONNTRACK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+if NF_CONNTRACK
+
 config NF_CT_ACCT
 	bool "Connection tracking flow accounting"
 	depends on NETFILTER_ADVANCED
-	depends on NF_CONNTRACK
 	help
 	  If this option is enabled, the connection tracking code will
 	  keep per-flow packet and byte counters.
@@ -63,7 +64,6 @@ config NF_CT_ACCT
 config NF_CONNTRACK_MARK
 	bool  'Connection mark tracking support'
 	depends on NETFILTER_ADVANCED
-	depends on NF_CONNTRACK
 	help
 	  This option enables support for connection marks, used by the
 	  `CONNMARK' target and `connmark' match. Similar to the mark value
@@ -72,7 +72,7 @@ config NF_CONNTRACK_MARK
 
 config NF_CONNTRACK_SECMARK
 	bool  'Connection tracking security mark support'
-	depends on NF_CONNTRACK && NETWORK_SECMARK
+	depends on NETWORK_SECMARK
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option enables security markings to be applied to
@@ -85,7 +85,6 @@ config NF_CONNTRACK_SECMARK
 
 config NF_CONNTRACK_EVENTS
 	bool "Connection tracking events"
-	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	help
 	  If this option is enabled, the connection tracking code will
@@ -96,7 +95,7 @@ config NF_CONNTRACK_EVENTS
 
 config NF_CT_PROTO_DCCP
 	tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
-	depends on EXPERIMENTAL && NF_CONNTRACK
+	depends on EXPERIMENTAL
 	depends on NETFILTER_ADVANCED
 	default IP_DCCP
 	help
@@ -107,11 +106,10 @@ config NF_CT_PROTO_DCCP
 
 config NF_CT_PROTO_GRE
 	tristate
-	depends on NF_CONNTRACK
 
 config NF_CT_PROTO_SCTP
 	tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
-	depends on EXPERIMENTAL && NF_CONNTRACK
+	depends on EXPERIMENTAL
 	depends on NETFILTER_ADVANCED
 	default IP_SCTP
 	help
@@ -123,7 +121,6 @@ config NF_CT_PROTO_SCTP
 
 config NF_CT_PROTO_UDPLITE
 	tristate 'UDP-Lite protocol connection tracking support'
-	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	help
 	  With this option enabled, the layer 3 independent connection
@@ -134,7 +131,6 @@ config NF_CT_PROTO_UDPLITE
 
 config NF_CONNTRACK_AMANDA
 	tristate "Amanda backup protocol support"
-	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select TEXTSEARCH
 	select TEXTSEARCH_KMP
@@ -150,7 +146,6 @@ config NF_CONNTRACK_AMANDA
 
 config NF_CONNTRACK_FTP
 	tristate "FTP protocol support"
-	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	help
 	  Tracking FTP connections is problematic: special helpers are
@@ -165,7 +160,7 @@ config NF_CONNTRACK_FTP
 
 config NF_CONNTRACK_H323
 	tristate "H.323 protocol support"
-	depends on NF_CONNTRACK && (IPV6 || IPV6=n)
+	depends on (IPV6 || IPV6=n)
 	depends on NETFILTER_ADVANCED
 	help
 	  H.323 is a VoIP signalling protocol from ITU-T. As one of the most
@@ -185,7 +180,6 @@ config NF_CONNTRACK_H323
 
 config NF_CONNTRACK_IRC
 	tristate "IRC protocol support"
-	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	help
 	  There is a commonly-used extension to IRC called
@@ -201,7 +195,6 @@ config NF_CONNTRACK_IRC
 
 config NF_CONNTRACK_NETBIOS_NS
 	tristate "NetBIOS name service protocol support"
-	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	help
 	  NetBIOS name service requests are sent as broadcast messages from an
@@ -221,7 +214,6 @@ config NF_CONNTRACK_NETBIOS_NS
 
 config NF_CONNTRACK_PPTP
 	tristate "PPtP protocol support"
-	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_CT_PROTO_GRE
 	help
@@ -241,7 +233,7 @@ config NF_CONNTRACK_PPTP
 
 config NF_CONNTRACK_SANE
 	tristate "SANE protocol support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && NF_CONNTRACK
+	depends on EXPERIMENTAL
 	depends on NETFILTER_ADVANCED
 	help
 	  SANE is a protocol for remote access to scanners as implemented
@@ -255,7 +247,6 @@ config NF_CONNTRACK_SANE
 
 config NF_CONNTRACK_SIP
 	tristate "SIP protocol support"
-	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	help
 	  SIP is an application-layer control protocol that can establish,
@@ -268,7 +259,6 @@ config NF_CONNTRACK_SIP
 
 config NF_CONNTRACK_TFTP
 	tristate "TFTP protocol support"
-	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	help
 	  TFTP connection tracking helper, this is required depending
@@ -280,7 +270,6 @@ config NF_CONNTRACK_TFTP
 
 config NF_CT_NETLINK
 	tristate 'Connection tracking netlink interface'
-	depends on NF_CONNTRACK
 	select NETFILTER_NETLINK
 	depends on NF_NAT=n || NF_NAT
 	default m if NETFILTER_ADVANCED=n
@@ -302,6 +291,8 @@ config NETFILTER_TPROXY
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+endif # NF_CONNTRACK
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
@@ -309,11 +300,12 @@ config NETFILTER_XTABLES
 	  This is required if you intend to use any of ip_tables,
 	  ip6_tables or arp_tables.
 
+if NETFILTER_XTABLES
+
 # alphabetically ordered list of targets
 
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `CLASSIFY' target, which enables the user to set
@@ -326,7 +318,6 @@ config NETFILTER_XT_TARGET_CLASSIFY
 
 config NETFILTER_XT_TARGET_CONNMARK
 	tristate  '"CONNMARK" target support'
-	depends on NETFILTER_XTABLES
 	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
@@ -342,7 +333,7 @@ config NETFILTER_XT_TARGET_CONNMARK
 
 config NETFILTER_XT_TARGET_CONNSECMARK
 	tristate '"CONNSECMARK" target support'
-	depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK
+	depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK
 	default m if NETFILTER_ADVANCED=n
 	help
 	  The CONNSECMARK target copies security markings from packets
@@ -354,7 +345,6 @@ config NETFILTER_XT_TARGET_CONNSECMARK
 
 config NETFILTER_XT_TARGET_DSCP
 	tristate '"DSCP" and "TOS" target support'
-	depends on NETFILTER_XTABLES
 	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NETFILTER_ADVANCED
 	help
@@ -371,7 +361,6 @@ config NETFILTER_XT_TARGET_DSCP
 
 config NETFILTER_XT_TARGET_MARK
 	tristate '"MARK" target support'
-	depends on NETFILTER_XTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `MARK' target, which allows you to create rules
@@ -385,7 +374,6 @@ config NETFILTER_XT_TARGET_MARK
 
 config NETFILTER_XT_TARGET_NFLOG
 	tristate '"NFLOG" target support'
-	depends on NETFILTER_XTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option enables the NFLOG target, which allows to LOG
@@ -397,7 +385,6 @@ config NETFILTER_XT_TARGET_NFLOG
 
 config NETFILTER_XT_TARGET_NFQUEUE
 	tristate '"NFQUEUE" target Support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This target replaced the old obsolete QUEUE target.
@@ -409,7 +396,6 @@ config NETFILTER_XT_TARGET_NFQUEUE
 
 config NETFILTER_XT_TARGET_NOTRACK
 	tristate  '"NOTRACK" target support'
-	depends on NETFILTER_XTABLES
 	depends on IP_NF_RAW || IP6_NF_RAW
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
@@ -424,7 +410,6 @@ config NETFILTER_XT_TARGET_NOTRACK
 
 config NETFILTER_XT_TARGET_RATEEST
 	tristate '"RATEEST" target support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `RATEEST' target, which allows to measure
@@ -450,7 +435,6 @@ config NETFILTER_XT_TARGET_TPROXY
 
 config NETFILTER_XT_TARGET_TRACE
 	tristate  '"TRACE" target support'
-	depends on NETFILTER_XTABLES
 	depends on IP_NF_RAW || IP6_NF_RAW
 	depends on NETFILTER_ADVANCED
 	help
@@ -463,7 +447,7 @@ config NETFILTER_XT_TARGET_TRACE
 
 config NETFILTER_XT_TARGET_SECMARK
 	tristate '"SECMARK" target support'
-	depends on NETFILTER_XTABLES && NETWORK_SECMARK
+	depends on NETWORK_SECMARK
 	default m if NETFILTER_ADVANCED=n
 	help
 	  The SECMARK target allows security marking of network
@@ -473,7 +457,7 @@ config NETFILTER_XT_TARGET_SECMARK
 
 config NETFILTER_XT_TARGET_TCPMSS
 	tristate '"TCPMSS" target support'
-	depends on NETFILTER_XTABLES && (IPV6 || IPV6=n)
+	depends on (IPV6 || IPV6=n)
 	default m if NETFILTER_ADVANCED=n
 	---help---
 	  This option adds a `TCPMSS' target, which allows you to alter the
@@ -500,7 +484,7 @@ config NETFILTER_XT_TARGET_TCPMSS
 
 config NETFILTER_XT_TARGET_TCPOPTSTRIP
 	tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)'
-	depends on EXPERIMENTAL && NETFILTER_XTABLES
+	depends on EXPERIMENTAL
 	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NETFILTER_ADVANCED
 	help
@@ -509,7 +493,6 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP
 
 config NETFILTER_XT_MATCH_COMMENT
 	tristate  '"comment" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `comment' dummy-match, which allows you to put
@@ -520,7 +503,6 @@ config NETFILTER_XT_MATCH_COMMENT
 
 config NETFILTER_XT_MATCH_CONNBYTES
 	tristate  '"connbytes" per-connection counter match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_CT_ACCT
@@ -533,7 +515,6 @@ config NETFILTER_XT_MATCH_CONNBYTES
 
 config NETFILTER_XT_MATCH_CONNLIMIT
 	tristate '"connlimit" match support"'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	---help---
@@ -542,7 +523,6 @@ config NETFILTER_XT_MATCH_CONNLIMIT
 
 config NETFILTER_XT_MATCH_CONNMARK
 	tristate  '"connmark" connection mark match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_CONNTRACK_MARK
@@ -556,7 +536,6 @@ config NETFILTER_XT_MATCH_CONNMARK
 
 config NETFILTER_XT_MATCH_CONNTRACK
 	tristate '"conntrack" connection tracking match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	help
@@ -570,7 +549,6 @@ config NETFILTER_XT_MATCH_CONNTRACK
 
 config NETFILTER_XT_MATCH_DCCP
 	tristate '"dccp" protocol match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	default IP_DCCP
 	help
@@ -583,7 +561,6 @@ config NETFILTER_XT_MATCH_DCCP
 
 config NETFILTER_XT_MATCH_DSCP
 	tristate '"dscp" and "tos" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `DSCP' match, which allows you to match against
@@ -599,7 +576,6 @@ config NETFILTER_XT_MATCH_DSCP
 
 config NETFILTER_XT_MATCH_ESP
 	tristate '"esp" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This match extension allows you to match a range of SPIs
@@ -609,7 +585,7 @@ config NETFILTER_XT_MATCH_ESP
 
 config NETFILTER_XT_MATCH_HASHLIMIT
 	tristate '"hashlimit" match support'
-	depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+	depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `hashlimit' match.
@@ -624,7 +600,6 @@ config NETFILTER_XT_MATCH_HASHLIMIT
 
 config NETFILTER_XT_MATCH_HELPER
 	tristate '"helper" match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	help
@@ -635,7 +610,6 @@ config NETFILTER_XT_MATCH_HELPER
 
 config NETFILTER_XT_MATCH_IPRANGE
 	tristate '"iprange" address range match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	---help---
 	This option adds a "iprange" match, which allows you to match based on
@@ -646,7 +620,6 @@ config NETFILTER_XT_MATCH_IPRANGE
 
 config NETFILTER_XT_MATCH_LENGTH
 	tristate '"length" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option allows you to match the length of a packet against a
@@ -656,7 +629,6 @@ config NETFILTER_XT_MATCH_LENGTH
 
 config NETFILTER_XT_MATCH_LIMIT
 	tristate '"limit" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  limit matching allows you to control the rate at which a rule can be
@@ -667,7 +639,6 @@ config NETFILTER_XT_MATCH_LIMIT
 
 config NETFILTER_XT_MATCH_MAC
 	tristate '"mac" address match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  MAC matching allows you to match packets based on the source
@@ -677,7 +648,6 @@ config NETFILTER_XT_MATCH_MAC
 
 config NETFILTER_XT_MATCH_MARK
 	tristate '"mark" match support'
-	depends on NETFILTER_XTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  Netfilter mark matching allows you to match packets based on the
@@ -688,7 +658,6 @@ config NETFILTER_XT_MATCH_MARK
 
 config NETFILTER_XT_MATCH_MULTIPORT
 	tristate '"multiport" Multiple port match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  Multiport matching allows you to match TCP or UDP packets based on
@@ -699,7 +668,6 @@ config NETFILTER_XT_MATCH_MULTIPORT
 
 config NETFILTER_XT_MATCH_OWNER
 	tristate '"owner" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	---help---
 	Socket owner matching allows you to match locally-generated packets
@@ -708,7 +676,7 @@ config NETFILTER_XT_MATCH_OWNER
 
 config NETFILTER_XT_MATCH_POLICY
 	tristate 'IPsec "policy" match support'
-	depends on NETFILTER_XTABLES && XFRM
+	depends on XFRM
 	default m if NETFILTER_ADVANCED=n
 	help
 	  Policy matching allows you to match packets based on the
@@ -719,7 +687,7 @@ config NETFILTER_XT_MATCH_POLICY
 
 config NETFILTER_XT_MATCH_PHYSDEV
 	tristate '"physdev" match support'
-	depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER
+	depends on BRIDGE && BRIDGE_NETFILTER
 	depends on NETFILTER_ADVANCED
 	help
 	  Physdev packet matching matches against the physical bridge ports
@@ -729,7 +697,6 @@ config NETFILTER_XT_MATCH_PHYSDEV
 
 config NETFILTER_XT_MATCH_PKTTYPE
 	tristate '"pkttype" packet type match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  Packet type matching allows you to match a packet by
@@ -742,7 +709,6 @@ config NETFILTER_XT_MATCH_PKTTYPE
 
 config NETFILTER_XT_MATCH_QUOTA
 	tristate '"quota" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `quota' match, which allows to match on a
@@ -753,7 +719,6 @@ config NETFILTER_XT_MATCH_QUOTA
 
 config NETFILTER_XT_MATCH_RATEEST
 	tristate '"rateest" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	select NETFILTER_XT_TARGET_RATEEST
 	help
@@ -764,7 +729,6 @@ config NETFILTER_XT_MATCH_RATEEST
 
 config NETFILTER_XT_MATCH_REALM
 	tristate  '"realm" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	select NET_CLS_ROUTE
 	help
@@ -779,7 +743,6 @@ config NETFILTER_XT_MATCH_REALM
 
 config NETFILTER_XT_MATCH_RECENT
 	tristate '"recent" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	---help---
 	This match is used for creating one or many lists of recently
@@ -797,7 +760,7 @@ config NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
 
 config NETFILTER_XT_MATCH_SCTP
 	tristate  '"sctp" protocol match support (EXPERIMENTAL)'
-	depends on NETFILTER_XTABLES && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	depends on NETFILTER_ADVANCED
 	default IP_SCTP
 	help
@@ -825,7 +788,6 @@ config NETFILTER_XT_MATCH_SOCKET
 
 config NETFILTER_XT_MATCH_STATE
 	tristate '"state" match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	help
@@ -837,7 +799,6 @@ config NETFILTER_XT_MATCH_STATE
 
 config NETFILTER_XT_MATCH_STATISTIC
 	tristate '"statistic" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `statistic' match, which allows you to match
@@ -847,7 +808,6 @@ config NETFILTER_XT_MATCH_STATISTIC
 
 config NETFILTER_XT_MATCH_STRING
 	tristate  '"string" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	select TEXTSEARCH
 	select TEXTSEARCH_KMP
@@ -861,7 +821,6 @@ config NETFILTER_XT_MATCH_STRING
 
 config NETFILTER_XT_MATCH_TCPMSS
 	tristate '"tcpmss" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `tcpmss' match, which allows you to examine the
@@ -872,7 +831,6 @@ config NETFILTER_XT_MATCH_TCPMSS
 
 config NETFILTER_XT_MATCH_TIME
 	tristate '"time" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	---help---
 	  This option adds a "time" match, which allows you to match based on
@@ -887,7 +845,6 @@ config NETFILTER_XT_MATCH_TIME
 
 config NETFILTER_XT_MATCH_U32
 	tristate '"u32" match support'
-	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	---help---
 	  u32 allows you to extract quantities of up to 4 bytes from a packet,
@@ -899,5 +856,6 @@ config NETFILTER_XT_MATCH_U32
 
 	  Details and examples are in the kernel module source.
 
-endmenu
+endif # NETFILTER_XTABLES
 
+endmenu
-- 
cgit v1.2.3


From f7108a20dee44e5bb037f9e48f6a207b42e6ae1c Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:18 +0200
Subject: netfilter: xtables: move extension arguments into compound structure
 (1/6)

The function signatures for Xtables extensions have grown over time.
It involves a lot of typing/replication, and also a bit of stack space
even if they are not used. Realize an NFWS2008 idea and pack them into
structs. The skb remains outside of the struct so gcc can continue to
apply its optimizations.

This patch does this for match extensions' match functions.

A few ambiguities have also been addressed. The "offset" parameter for
example has been renamed to "fragoff" (there are so many different
offsets already) and "protoff" to "thoff" (there is more than just one
protocol here, so clarify).

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c     |  6 ++---
 net/bridge/netfilter/ebt_among.c     |  6 ++---
 net/bridge/netfilter/ebt_arp.c       |  6 ++---
 net/bridge/netfilter/ebt_ip.c        |  6 ++---
 net/bridge/netfilter/ebt_ip6.c       |  6 ++---
 net/bridge/netfilter/ebt_limit.c     |  6 ++---
 net/bridge/netfilter/ebt_mark_m.c    |  6 ++---
 net/bridge/netfilter/ebt_pkttype.c   |  7 ++----
 net/bridge/netfilter/ebt_stp.c       |  6 ++---
 net/bridge/netfilter/ebt_vlan.c      |  6 ++---
 net/bridge/netfilter/ebtables.c      | 16 ++++++++-----
 net/ipv4/netfilter/ip_tables.c       | 46 ++++++++++++++++--------------------
 net/ipv4/netfilter/ipt_addrtype.c    | 18 +++++---------
 net/ipv4/netfilter/ipt_ah.c          | 14 ++++-------
 net/ipv4/netfilter/ipt_ecn.c         |  9 +++----
 net/ipv4/netfilter/ipt_ttl.c         |  7 ++----
 net/ipv6/netfilter/ip6_tables.c      | 44 +++++++++++++---------------------
 net/ipv6/netfilter/ip6t_ah.c         | 11 ++++-----
 net/ipv6/netfilter/ip6t_eui64.c      |  9 +++----
 net/ipv6/netfilter/ip6t_frag.c       | 11 ++++-----
 net/ipv6/netfilter/ip6t_hbh.c        | 13 ++++------
 net/ipv6/netfilter/ip6t_hl.c         |  7 ++----
 net/ipv6/netfilter/ip6t_ipv6header.c |  7 ++----
 net/ipv6/netfilter/ip6t_mh.c         | 15 +++++-------
 net/ipv6/netfilter/ip6t_rt.c         | 11 ++++-----
 net/netfilter/xt_comment.c           |  5 +---
 net/netfilter/xt_connbytes.c         |  7 ++----
 net/netfilter/xt_connlimit.c         | 17 ++++++-------
 net/netfilter/xt_connmark.c          | 14 ++++-------
 net/netfilter/xt_conntrack.c         | 22 +++++++----------
 net/netfilter/xt_dccp.c              | 16 ++++++-------
 net/netfilter/xt_dscp.c              | 30 ++++++++---------------
 net/netfilter/xt_esp.c               | 13 ++++------
 net/netfilter/xt_hashlimit.c         | 22 +++++++----------
 net/netfilter/xt_helper.c            |  7 ++----
 net/netfilter/xt_iprange.c           | 21 +++++-----------
 net/netfilter/xt_length.c            | 14 ++++-------
 net/netfilter/xt_limit.c             |  7 ++----
 net/netfilter/xt_mac.c               |  7 ++----
 net/netfilter/xt_mark.c              | 13 ++++------
 net/netfilter/xt_multiport.c         | 26 ++++++++------------
 net/netfilter/xt_owner.c             | 21 +++++-----------
 net/netfilter/xt_physdev.c           |  7 ++----
 net/netfilter/xt_pkttype.c           | 11 ++++-----
 net/netfilter/xt_policy.c            | 11 ++++-----
 net/netfilter/xt_quota.c             |  7 ++----
 net/netfilter/xt_rateest.c           | 12 +++-------
 net/netfilter/xt_realm.c             |  7 ++----
 net/netfilter/xt_recent.c            | 17 ++++++-------
 net/netfilter/xt_sctp.c              | 16 ++++++-------
 net/netfilter/xt_socket.c            | 11 ++-------
 net/netfilter/xt_state.c             |  7 ++----
 net/netfilter/xt_statistic.c         |  7 ++----
 net/netfilter/xt_string.c            |  9 +++----
 net/netfilter/xt_tcpmss.c            | 13 ++++------
 net/netfilter/xt_tcpudp.c            | 36 ++++++++++++----------------
 net/netfilter/xt_time.c              |  6 ++---
 net/netfilter/xt_u32.c               |  7 ++----
 58 files changed, 265 insertions(+), 480 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 6fc2a59e09a..c9e1bc14951 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -13,11 +13,9 @@
 #include <linux/netfilter_bridge/ebt_802_3.h>
 
 static bool
-ebt_802_3_mt(const struct sk_buff *skb, const struct net_device *in,
-	     const struct net_device *out, const struct xt_match *match,
-	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_802_3_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_802_3_info *info = data;
+	const struct ebt_802_3_info *info = par->matchinfo;
 	const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb);
 	__be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type;
 
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 084559e1840..0ad0db3e815 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -128,11 +128,9 @@ static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
 }
 
 static bool
-ebt_among_mt(const struct sk_buff *skb, const struct net_device *in,
-	     const struct net_device *out, const struct xt_match *match,
-	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_among_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_among_info *info = data;
+	const struct ebt_among_info *info = par->matchinfo;
 	const char *dmac, *smac;
 	const struct ebt_mac_wormhash *wh_dst, *wh_src;
 	__be32 dip = 0, sip = 0;
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index a073dffe7a1..1ff8fa3a9e7 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -16,11 +16,9 @@
 #include <linux/netfilter_bridge/ebt_arp.h>
 
 static bool
-ebt_arp_mt(const struct sk_buff *skb, const struct net_device *in,
-	   const struct net_device *out, const struct xt_match *match,
-	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_arp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_arp_info *info = data;
+	const struct ebt_arp_info *info = par->matchinfo;
 	const struct arphdr *ah;
 	struct arphdr _arph;
 
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index b42c7ce799b..c70ea39840b 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -25,11 +25,9 @@ struct tcpudphdr {
 };
 
 static bool
-ebt_ip_mt(const struct sk_buff *skb, const struct net_device *in,
-	  const struct net_device *out, const struct xt_match *match,
-	  const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_ip_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_ip_info *info = data;
+	const struct ebt_ip_info *info = par->matchinfo;
 	const struct iphdr *ih;
 	struct iphdr _iph;
 	const struct tcpudphdr *pptr;
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 7bd98312967..5acee02de72 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -28,11 +28,9 @@ struct tcpudphdr {
 };
 
 static bool
-ebt_ip6_mt(const struct sk_buff *skb, const struct net_device *in,
-	   const struct net_device *out, const struct xt_match *match,
-	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_ip6_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_ip6_info *info = data;
+	const struct ebt_ip6_info *info = par->matchinfo;
 	const struct ipv6hdr *ih6;
 	struct ipv6hdr _ip6h;
 	const struct tcpudphdr *pptr;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 58aaaa14906..9a3ec8cadaa 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -31,11 +31,9 @@ static DEFINE_SPINLOCK(limit_lock);
 #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
 
 static bool
-ebt_limit_mt(const struct sk_buff *skb, const struct net_device *in,
-	     const struct net_device *out, const struct xt_match *match,
-	     const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_limit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	struct ebt_limit_info *info = (void *)data;
+	struct ebt_limit_info *info = (void *)par->matchinfo;
 	unsigned long now = jiffies;
 
 	spin_lock_bh(&limit_lock);
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index aa6781c7f98..5b22ef96127 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -13,11 +13,9 @@
 #include <linux/netfilter_bridge/ebt_mark_m.h>
 
 static bool
-ebt_mark_mt(const struct sk_buff *skb, const struct net_device *in,
-	    const struct net_device *out, const struct xt_match *match,
-	    const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_mark_m_info *info = data;
+	const struct ebt_mark_m_info *info = par->matchinfo;
 
 	if (info->bitmask & EBT_MARK_OR)
 		return !!(skb->mark & info->mask) ^ info->invert;
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 1c04ce5a52c..b756f88fb10 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -13,12 +13,9 @@
 #include <linux/netfilter_bridge/ebt_pkttype.h>
 
 static bool
-ebt_pkttype_mt(const struct sk_buff *skb, const struct net_device *in,
-	       const struct net_device *out, const struct xt_match *match,
-	       const void *data, int offset, unsigned int protoff,
-	       bool *hotdrop)
+ebt_pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_pkttype_info *info = data;
+	const struct ebt_pkttype_info *info = par->matchinfo;
 
 	return (skb->pkt_type == info->pkt_type) ^ info->invert;
 }
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 28bb48b67a8..06d777c62c3 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -120,11 +120,9 @@ static bool ebt_filter_config(const struct ebt_stp_info *info,
 }
 
 static bool
-ebt_stp_mt(const struct sk_buff *skb, const struct net_device *in,
-	   const struct net_device *out, const struct xt_match *match,
-	   const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_stp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_stp_info *info = data;
+	const struct ebt_stp_info *info = par->matchinfo;
 	const struct stp_header *sp;
 	struct stp_header _stph;
 	const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 5addef6d62f..b05b4a81834 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -41,11 +41,9 @@ MODULE_LICENSE("GPL");
 #define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; }
 
 static bool
-ebt_vlan_mt(const struct sk_buff *skb, const struct net_device *in,
-	    const struct net_device *out, const struct xt_match *match,
-	    const void *data, int offset, unsigned int protoff, bool *hotdrop)
+ebt_vlan_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ebt_vlan_info *info = data;
+	const struct ebt_vlan_info *info = par->matchinfo;
 	const struct vlan_hdr *fp;
 	struct vlan_hdr _frame;
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 7ee72b71d3c..f8e1822f38d 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -74,11 +74,11 @@ static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
 }
 
 static inline int ebt_do_match (struct ebt_entry_match *m,
-   const struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out, bool *hotdrop)
+   const struct sk_buff *skb, struct xt_match_param *par)
 {
-	return m->u.match->match(skb, in, out, m->u.match,
-	       m->data, 0, 0, hotdrop);
+	par->match     = m->u.match;
+	par->matchinfo = m->data;
+	return m->u.match->match(skb, par);
 }
 
 static inline int ebt_dev_check(char *entry, const struct net_device *device)
@@ -155,6 +155,11 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 	char *base;
 	struct ebt_table_info *private;
 	bool hotdrop = false;
+	struct xt_match_param mtpar;
+
+	mtpar.in      = in;
+	mtpar.out     = out;
+	mtpar.hotdrop = &hotdrop;
 
 	read_lock_bh(&table->lock);
 	private = table->private;
@@ -175,8 +180,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 		if (ebt_basic_match(point, eth_hdr(skb), in, out))
 			goto letscontinue;
 
-		if (EBT_MATCH_ITERATE(point, ebt_do_match, skb,
-		    in, out, &hotdrop) != 0)
+		if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &mtpar) != 0)
 			goto letscontinue;
 		if (hotdrop) {
 			read_unlock_bh(&table->lock);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b4c74a7a807..99fdb59454f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,16 +186,14 @@ ipt_error(struct sk_buff *skb,
 
 /* Performance critical - called for every packet */
 static inline bool
-do_match(struct ipt_entry_match *m,
-	      const struct sk_buff *skb,
-	      const struct net_device *in,
-	      const struct net_device *out,
-	      int offset,
-	      bool *hotdrop)
+do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
+	 struct xt_match_param *par)
 {
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+
 	/* Stop iteration if it doesn't match */
-	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
-				      offset, ip_hdrlen(skb), hotdrop))
+	if (!m->u.kernel.match->match(skb, par))
 		return true;
 	else
 		return false;
@@ -326,7 +324,6 @@ ipt_do_table(struct sk_buff *skb,
 	     struct xt_table *table)
 {
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
-	u_int16_t offset;
 	const struct iphdr *ip;
 	u_int16_t datalen;
 	bool hotdrop = false;
@@ -336,6 +333,7 @@ ipt_do_table(struct sk_buff *skb,
 	void *table_base;
 	struct ipt_entry *e, *back;
 	struct xt_table_info *private;
+	struct xt_match_param mtpar;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -348,7 +346,11 @@ ipt_do_table(struct sk_buff *skb,
 	 * things we don't know, ie. tcp syn flag or ports).  If the
 	 * rule is also a fragment-specific rule, non-fragments won't
 	 * match it. */
-	offset = ntohs(ip->frag_off) & IP_OFFSET;
+	mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+	mtpar.thoff   = ip_hdrlen(skb);
+	mtpar.hotdrop = &hotdrop;
+	mtpar.in      = in;
+	mtpar.out     = out;
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -362,12 +364,11 @@ ipt_do_table(struct sk_buff *skb,
 	do {
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
-		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
+		if (ip_packet_match(ip, indev, outdev,
+		    &e->ip, mtpar.fragoff)) {
 			struct ipt_entry_target *t;
 
-			if (IPT_MATCH_ITERATE(e, do_match,
-					      skb, in, out,
-					      offset, &hotdrop) != 0)
+			if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
 				goto no_match;
 
 			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
@@ -2116,30 +2117,23 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
 }
 
 static bool
-icmp_match(const struct sk_buff *skb,
-	   const struct net_device *in,
-	   const struct net_device *out,
-	   const struct xt_match *match,
-	   const void *matchinfo,
-	   int offset,
-	   unsigned int protoff,
-	   bool *hotdrop)
+icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct icmphdr *ic;
 	struct icmphdr _icmph;
-	const struct ipt_icmp *icmpinfo = matchinfo;
+	const struct ipt_icmp *icmpinfo = par->matchinfo;
 
 	/* Must not be a fragment. */
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
+	ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
 	if (ic == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil ICMP tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 2c9d88a6c83..e60995e4c20 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -30,12 +30,9 @@ static inline bool match_type(const struct net_device *dev, __be32 addr,
 }
 
 static bool
-addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-	       const struct net_device *out, const struct xt_match *match,
-	       const void *matchinfo, int offset, unsigned int protoff,
-	       bool *hotdrop)
+addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_addrtype_info *info = matchinfo;
+	const struct ipt_addrtype_info *info = par->matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
 	bool ret = true;
 
@@ -50,20 +47,17 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in,
 }
 
 static bool
-addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in,
-	       const struct net_device *out, const struct xt_match *match,
-	       const void *matchinfo, int offset, unsigned int protoff,
-	       bool *hotdrop)
+addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_addrtype_info_v1 *info = matchinfo;
+	const struct ipt_addrtype_info_v1 *info = par->matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
 	const struct net_device *dev = NULL;
 	bool ret = true;
 
 	if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
-		dev = in;
+		dev = par->in;
 	else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
-		dev = out;
+		dev = par->out;
 
 	if (info->source)
 		ret &= match_type(dev, iph->saddr, info->source) ^
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index e2e993edd66..2fce19ef4f3 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -36,27 +36,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 	return r;
 }
 
-static bool
-ah_mt(const struct sk_buff *skb, const struct net_device *in,
-      const struct net_device *out, const struct xt_match *match,
-      const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct ip_auth_hdr _ahdr;
 	const struct ip_auth_hdr *ah;
-	const struct ipt_ah *ahinfo = matchinfo;
+	const struct ipt_ah *ahinfo = par->matchinfo;
 
 	/* Must not be a fragment. */
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	ah = skb_header_pointer(skb, protoff,
-				sizeof(_ahdr), &_ahdr);
+	ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
 	if (ah == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil AH tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 2c45b4be7c3..06915463150 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -67,12 +67,9 @@ static inline bool match_tcp(const struct sk_buff *skb,
 	return true;
 }
 
-static bool
-ecn_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_ecn_info *info = matchinfo;
+	const struct ipt_ecn_info *info = par->matchinfo;
 
 	if (info->operation & IPT_ECN_OP_MATCH_IP)
 		if (!match_ip(skb, info))
@@ -81,7 +78,7 @@ ecn_mt(const struct sk_buff *skb, const struct net_device *in,
 	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
 		if (ip_hdr(skb)->protocol != IPPROTO_TCP)
 			return false;
-		if (!match_tcp(skb, info, hotdrop))
+		if (!match_tcp(skb, info, par->hotdrop))
 			return false;
 	}
 
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index d4c3fdc2a79..297f1cbf4ff 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -18,12 +18,9 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: IPv4 TTL field match");
 MODULE_LICENSE("GPL");
 
-static bool
-ttl_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_ttl_info *info = matchinfo;
+	const struct ipt_ttl_info *info = par->matchinfo;
 	const u8 ttl = ip_hdr(skb)->ttl;
 
 	switch (info->mode) {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 12c41b8d365..cf2c5370a4e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -215,17 +215,14 @@ ip6t_error(struct sk_buff *skb,
 
 /* Performance critical - called for every packet */
 static inline bool
-do_match(struct ip6t_entry_match *m,
-	      const struct sk_buff *skb,
-	      const struct net_device *in,
-	      const struct net_device *out,
-	      int offset,
-	      unsigned int protoff,
-	      bool *hotdrop)
+do_match(struct ip6t_entry_match *m, const struct sk_buff *skb,
+	 struct xt_match_param *par)
 {
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+
 	/* Stop iteration if it doesn't match */
-	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
-				      offset, protoff, hotdrop))
+	if (!m->u.kernel.match->match(skb, par))
 		return true;
 	else
 		return false;
@@ -355,8 +352,6 @@ ip6t_do_table(struct sk_buff *skb,
 	      struct xt_table *table)
 {
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
-	int offset = 0;
-	unsigned int protoff = 0;
 	bool hotdrop = false;
 	/* Initializing verdict to NF_DROP keeps gcc happy. */
 	unsigned int verdict = NF_DROP;
@@ -364,6 +359,7 @@ ip6t_do_table(struct sk_buff *skb,
 	void *table_base;
 	struct ip6t_entry *e, *back;
 	struct xt_table_info *private;
+	struct xt_match_param mtpar;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -374,6 +370,9 @@ ip6t_do_table(struct sk_buff *skb,
 	 * things we don't know, ie. tcp syn flag or ports).  If the
 	 * rule is also a fragment-specific rule, non-fragments won't
 	 * match it. */
+	mtpar.hotdrop = &hotdrop;
+	mtpar.in      = in;
+	mtpar.out     = out;
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -388,12 +387,10 @@ ip6t_do_table(struct sk_buff *skb,
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
 		if (ip6_packet_match(skb, indev, outdev, &e->ipv6,
-			&protoff, &offset, &hotdrop)) {
+			&mtpar.thoff, &mtpar.fragoff, &hotdrop)) {
 			struct ip6t_entry_target *t;
 
-			if (IP6T_MATCH_ITERATE(e, do_match,
-					       skb, in, out,
-					       offset, protoff, &hotdrop) != 0)
+			if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
 				goto no_match;
 
 			ADD_COUNTER(e->counters,
@@ -2141,30 +2138,23 @@ icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
 }
 
 static bool
-icmp6_match(const struct sk_buff *skb,
-	   const struct net_device *in,
-	   const struct net_device *out,
-	   const struct xt_match *match,
-	   const void *matchinfo,
-	   int offset,
-	   unsigned int protoff,
-	   bool *hotdrop)
+icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct icmp6hdr *ic;
 	struct icmp6hdr _icmph;
-	const struct ip6t_icmp *icmpinfo = matchinfo;
+	const struct ip6t_icmp *icmpinfo = par->matchinfo;
 
 	/* Must not be a fragment. */
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
+	ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
 	if (ic == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil ICMP tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 061f89beeb6..a04f2b8396e 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -36,14 +36,11 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 	return r;
 }
 
-static bool
-ah_mt6(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool ah_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct ip_auth_hdr _ah;
 	const struct ip_auth_hdr *ah;
-	const struct ip6t_ah *ahinfo = matchinfo;
+	const struct ip6t_ah *ahinfo = par->matchinfo;
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
 	int err;
@@ -51,13 +48,13 @@ ah_mt6(const struct sk_buff *skb, const struct net_device *in,
 	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = true;
+			*par->hotdrop = true;
 		return false;
 	}
 
 	ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
 	if (ah == NULL) {
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index ba38df1116f..db610bacbcc 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -20,18 +20,15 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
 static bool
-eui64_mt6(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+eui64_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	unsigned char eui64[8];
 	int i = 0;
 
 	if (!(skb_mac_header(skb) >= skb->head &&
 	      skb_mac_header(skb) + ETH_HLEN <= skb->data) &&
-	    offset != 0) {
-		*hotdrop = true;
+	    par->fragoff != 0) {
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 972f699af22..6951d0dacf4 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -35,27 +35,24 @@ id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
 }
 
 static bool
-frag_mt6(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+frag_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct frag_hdr _frag;
 	const struct frag_hdr *fh;
-	const struct ip6t_frag *fraginfo = matchinfo;
+	const struct ip6t_frag *fraginfo = par->matchinfo;
 	unsigned int ptr;
 	int err;
 
 	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = true;
+			*par->hotdrop = true;
 		return false;
 	}
 
 	fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
 	if (fh == NULL) {
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index d5edb51a595..d3351978819 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -42,14 +42,11 @@ MODULE_ALIAS("ip6t_dst");
  */
 
 static bool
-hbh_mt6(const struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, const struct xt_match *match,
-        const void *matchinfo, int offset, unsigned int protoff,
-        bool *hotdrop)
+hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct ipv6_opt_hdr _optsh;
 	const struct ipv6_opt_hdr *oh;
-	const struct ip6t_opts *optinfo = matchinfo;
+	const struct ip6t_opts *optinfo = par->matchinfo;
 	unsigned int temp;
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
@@ -61,16 +58,16 @@ hbh_mt6(const struct sk_buff *skb, const struct net_device *in,
 	unsigned int optlen;
 	int err;
 
-	err = ipv6_find_hdr(skb, &ptr, match->data, NULL);
+	err = ipv6_find_hdr(skb, &ptr, par->match->data, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = true;
+			*par->hotdrop = true;
 		return false;
 	}
 
 	oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
 	if (oh == NULL) {
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c
index 25c1eb92fac..c964dca1132 100644
--- a/net/ipv6/netfilter/ip6t_hl.c
+++ b/net/ipv6/netfilter/ip6t_hl.c
@@ -19,12 +19,9 @@ MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
 MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field match");
 MODULE_LICENSE("GPL");
 
-static bool
-hl_mt6(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool hl_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ip6t_hl_info *info = matchinfo;
+	const struct ip6t_hl_info *info = par->matchinfo;
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 
 	switch (info->mode) {
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index ef0661aacea..6aaca511d47 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -27,12 +27,9 @@ MODULE_DESCRIPTION("Xtables: IPv6 header types match");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
 static bool
-ipv6header_mt6(const struct sk_buff *skb, const struct net_device *in,
-               const struct net_device *out, const struct xt_match *match,
-               const void *matchinfo, int offset, unsigned int protoff,
-               bool *hotdrop)
+ipv6header_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ip6t_ipv6header_info *info = matchinfo;
+	const struct ip6t_ipv6header_info *info = par->matchinfo;
 	unsigned int temp;
 	int len;
 	u8 nexthdr;
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index dd876274ff6..2803258b6d0 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -37,32 +37,29 @@ type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert)
 	return (type >= min && type <= max) ^ invert;
 }
 
-static bool
-mh_mt6(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct ip6_mh _mh;
 	const struct ip6_mh *mh;
-	const struct ip6t_mh *mhinfo = matchinfo;
+	const struct ip6t_mh *mhinfo = par->matchinfo;
 
 	/* Must not be a fragment. */
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	mh = skb_header_pointer(skb, protoff, sizeof(_mh), &_mh);
+	mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh);
 	if (mh == NULL) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil MH tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
 	if (mh->ip6mh_proto != IPPROTO_NONE) {
 		duprintf("Dropping invalid MH Payload Proto: %u\n",
 			 mh->ip6mh_proto);
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 7c544ae591d..9cf4b8a37af 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -36,14 +36,11 @@ segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
 	return r;
 }
 
-static bool
-rt_mt6(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool rt_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct ipv6_rt_hdr _route;
 	const struct ipv6_rt_hdr *rh;
-	const struct ip6t_rt *rtinfo = matchinfo;
+	const struct ip6t_rt *rtinfo = par->matchinfo;
 	unsigned int temp;
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
@@ -55,13 +52,13 @@ rt_mt6(const struct sk_buff *skb, const struct net_device *in,
 	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = true;
+			*par->hotdrop = true;
 		return false;
 	}
 
 	rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
 	if (rh == NULL) {
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index fa211b2ab87..bd7aa57af42 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -16,10 +16,7 @@ MODULE_ALIAS("ipt_comment");
 MODULE_ALIAS("ip6t_comment");
 
 static bool
-comment_mt(const struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, const struct xt_match *match,
-           const void *matchinfo, int offset, unsigned int protooff,
-           bool *hotdrop)
+comment_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	/* We always match */
 	return true;
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index d2cd22a49c9..30c19b5fe90 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -17,12 +17,9 @@ MODULE_ALIAS("ipt_connbytes");
 MODULE_ALIAS("ip6t_connbytes");
 
 static bool
-connbytes_mt(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+connbytes_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_connbytes_info *sinfo = matchinfo;
+	const struct xt_connbytes_info *sinfo = par->matchinfo;
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	u_int64_t what = 0;	/* initialize to make gcc happy */
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index bd00830ff69..8b8f70e7664 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -178,12 +178,9 @@ static int count_them(struct xt_connlimit_data *data,
 }
 
 static bool
-connlimit_mt(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_connlimit_info *info = matchinfo;
+	const struct xt_connlimit_info *info = par->matchinfo;
 	union nf_inet_addr addr;
 	struct nf_conntrack_tuple tuple;
 	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
@@ -195,10 +192,10 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in,
 	if (ct != NULL)
 		tuple_ptr = &ct->tuplehash[0].tuple;
 	else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
-				    match->family, &tuple))
+				    par->match->family, &tuple))
 		goto hotdrop;
 
-	if (match->family == NFPROTO_IPV6) {
+	if (par->match->family == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
 		memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
 	} else {
@@ -208,19 +205,19 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in,
 
 	spin_lock_bh(&info->data->lock);
 	connections = count_them(info->data, tuple_ptr, &addr,
-	                         &info->mask, match);
+	                         &info->mask, par->match);
 	spin_unlock_bh(&info->data->lock);
 
 	if (connections < 0) {
 		/* kmalloc failed, drop it entirely */
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
 	return (connections > info->limit) ^ info->inverse;
 
  hotdrop:
-	*hotdrop = true;
+	*par->hotdrop = true;
 	return false;
 }
 
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 0577b8ff4e1..df4f4a865a5 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -34,12 +34,9 @@ MODULE_ALIAS("ipt_connmark");
 MODULE_ALIAS("ip6t_connmark");
 
 static bool
-connmark_mt(const struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, const struct xt_match *match,
-            const void *matchinfo, int offset, unsigned int protoff,
-            bool *hotdrop)
+connmark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_connmark_mtinfo1 *info = matchinfo;
+	const struct xt_connmark_mtinfo1 *info = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
 
@@ -51,12 +48,9 @@ connmark_mt(const struct sk_buff *skb, const struct net_device *in,
 }
 
 static bool
-connmark_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-               const struct net_device *out, const struct xt_match *match,
-               const void *matchinfo, int offset, unsigned int protoff,
-               bool *hotdrop)
+connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_connmark_info *info = matchinfo;
+	const struct xt_connmark_info *info = par->matchinfo;
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 392b457f9c2..13a7e4eacdf 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -25,12 +25,9 @@ MODULE_ALIAS("ipt_conntrack");
 MODULE_ALIAS("ip6t_conntrack");
 
 static bool
-conntrack_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-                const struct net_device *out, const struct xt_match *match,
-                const void *matchinfo, int offset, unsigned int protoff,
-                bool *hotdrop)
+conntrack_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_conntrack_info *sinfo = matchinfo;
+	const struct xt_conntrack_info *sinfo = par->matchinfo;
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	unsigned int statebit;
@@ -205,12 +202,9 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo1 *info,
 }
 
 static bool
-conntrack_mt(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_conntrack_mtinfo1 *info = matchinfo;
+	const struct xt_conntrack_mtinfo1 *info = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
 	unsigned int statebit;
@@ -244,22 +238,22 @@ conntrack_mt(const struct sk_buff *skb, const struct net_device *in,
 		return false;
 
 	if (info->match_flags & XT_CONNTRACK_ORIGSRC)
-		if (conntrack_mt_origsrc(ct, info, match->family) ^
+		if (conntrack_mt_origsrc(ct, info, par->match->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_ORIGSRC))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_ORIGDST)
-		if (conntrack_mt_origdst(ct, info, match->family) ^
+		if (conntrack_mt_origdst(ct, info, par->match->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_ORIGDST))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_REPLSRC)
-		if (conntrack_mt_replsrc(ct, info, match->family) ^
+		if (conntrack_mt_replsrc(ct, info, par->match->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_REPLSRC))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_REPLDST)
-		if (conntrack_mt_repldst(ct, info, match->family) ^
+		if (conntrack_mt_repldst(ct, info, par->match->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_REPLDST))
 			return false;
 
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index 87971f47132..7aa30bb9105 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -93,20 +93,18 @@ match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff,
 }
 
 static bool
-dccp_mt(const struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, const struct xt_match *match,
-        const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+dccp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_dccp_info *info = matchinfo;
+	const struct xt_dccp_info *info = par->matchinfo;
 	const struct dccp_hdr *dh;
 	struct dccp_hdr _dh;
 
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh);
+	dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh);
 	if (dh == NULL) {
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
@@ -118,8 +116,8 @@ dccp_mt(const struct sk_buff *skb, const struct net_device *in,
 			XT_DCCP_DEST_PORTS, info->flags, info->invflags)
 		&& DCCHECK(match_types(dh, info->typemask),
 			   XT_DCCP_TYPE, info->flags, info->invflags)
-		&& DCCHECK(match_option(info->option, skb, protoff, dh,
-					hotdrop),
+		&& DCCHECK(match_option(info->option, skb, par->thoff, dh,
+					par->hotdrop),
 			   XT_DCCP_OPTION, info->flags, info->invflags);
 }
 
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 7f03aa13a95..57d61206135 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -26,23 +26,18 @@ MODULE_ALIAS("ipt_tos");
 MODULE_ALIAS("ip6t_tos");
 
 static bool
-dscp_mt(const struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, const struct xt_match *match,
-        const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+dscp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_dscp_info *info = matchinfo;
+	const struct xt_dscp_info *info = par->matchinfo;
 	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	return (dscp == info->dscp) ^ !!info->invert;
 }
 
 static bool
-dscp_mt6(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+dscp_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_dscp_info *info = matchinfo;
+	const struct xt_dscp_info *info = par->matchinfo;
 	u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	return (dscp == info->dscp) ^ !!info->invert;
@@ -63,24 +58,19 @@ dscp_mt_check(const char *tablename, const void *info,
 	return true;
 }
 
-static bool tos_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-                      const struct net_device *out,
-                      const struct xt_match *match, const void *matchinfo,
-                      int offset, unsigned int protoff, bool *hotdrop)
+static bool
+tos_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_tos_info *info = matchinfo;
+	const struct ipt_tos_info *info = par->matchinfo;
 
 	return (ip_hdr(skb)->tos == info->tos) ^ info->invert;
 }
 
-static bool tos_mt(const struct sk_buff *skb, const struct net_device *in,
-                   const struct net_device *out, const struct xt_match *match,
-                   const void *matchinfo, int offset, unsigned int protoff,
-                   bool *hotdrop)
+static bool tos_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_tos_match_info *info = matchinfo;
+	const struct xt_tos_match_info *info = par->matchinfo;
 
-	if (match->family == NFPROTO_IPV4)
+	if (par->match->family == NFPROTO_IPV4)
 		return ((ip_hdr(skb)->tos & info->tos_mask) ==
 		       info->tos_value) ^ !!info->invert;
 	else
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 045c4deecaf..6d59f2e7c1c 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -42,26 +42,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 	return r;
 }
 
-static bool
-esp_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool esp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct ip_esp_hdr *eh;
 	struct ip_esp_hdr _esp;
-	const struct xt_esp *espinfo = matchinfo;
+	const struct xt_esp *espinfo = par->matchinfo;
 
 	/* Must not be a fragment. */
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	eh = skb_header_pointer(skb, protoff, sizeof(_esp), &_esp);
+	eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp);
 	if (eh == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil ESP tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 7bae369603d..22a60a728cf 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -563,19 +563,16 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 }
 
 static bool
-hashlimit_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-                const struct net_device *out, const struct xt_match *match,
-                const void *matchinfo, int offset, unsigned int protoff,
-                bool *hotdrop)
+hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct xt_hashlimit_info *r =
-		((const struct xt_hashlimit_info *)matchinfo)->u.master;
+		((const struct xt_hashlimit_info *)par->matchinfo)->u.master;
 	struct xt_hashlimit_htable *hinfo = r->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
 	struct dsthash_dst dst;
 
-	if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0)
+	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
 	spin_lock_bh(&hinfo->lock);
@@ -613,23 +610,20 @@ hashlimit_mt_v0(const struct sk_buff *skb, const struct net_device *in,
 	return false;
 
 hotdrop:
-	*hotdrop = true;
+	*par->hotdrop = true;
 	return false;
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_hashlimit_mtinfo1 *info = matchinfo;
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 	struct xt_hashlimit_htable *hinfo = info->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
 	struct dsthash_dst dst;
 
-	if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0)
+	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
 	spin_lock_bh(&hinfo->lock);
@@ -666,7 +660,7 @@ hashlimit_mt(const struct sk_buff *skb, const struct net_device *in,
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
  hotdrop:
-	*hotdrop = true;
+	*par->hotdrop = true;
 	return false;
 }
 
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 134d94324eb..73bdc3ba13f 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -24,12 +24,9 @@ MODULE_ALIAS("ip6t_helper");
 
 
 static bool
-helper_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+helper_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_helper_info *info = matchinfo;
+	const struct xt_helper_info *info = par->matchinfo;
 	const struct nf_conn *ct;
 	const struct nf_conn_help *master_help;
 	const struct nf_conntrack_helper *helper;
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index a7498cc48dc..6f62c36948d 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -17,12 +17,9 @@
 #include <linux/netfilter_ipv4/ipt_iprange.h>
 
 static bool
-iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-              const struct net_device *out, const struct xt_match *match,
-              const void *matchinfo, int offset, unsigned int protoff,
-              bool *hotdrop)
+iprange_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_iprange_info *info = matchinfo;
+	const struct ipt_iprange_info *info = par->matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
 
 	if (info->flags & IPRANGE_SRC) {
@@ -55,12 +52,9 @@ iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in,
 }
 
 static bool
-iprange_mt4(const struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, const struct xt_match *match,
-            const void *matchinfo, int offset, unsigned int protoff,
-            bool *hotdrop)
+iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_iprange_mtinfo *info = matchinfo;
+	const struct xt_iprange_mtinfo *info = par->matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
 	bool m;
 
@@ -111,12 +105,9 @@ iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b)
 }
 
 static bool
-iprange_mt6(const struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, const struct xt_match *match,
-            const void *matchinfo, int offset, unsigned int protoff,
-            bool *hotdrop)
+iprange_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_iprange_mtinfo *info = matchinfo;
+	const struct xt_iprange_mtinfo *info = par->matchinfo;
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	bool m;
 
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index b8612d1914b..c4871ca6c86 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -21,24 +21,18 @@ MODULE_ALIAS("ipt_length");
 MODULE_ALIAS("ip6t_length");
 
 static bool
-length_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+length_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_length_info *info = matchinfo;
+	const struct xt_length_info *info = par->matchinfo;
 	u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
 
 	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
 }
 
 static bool
-length_mt6(const struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, const struct xt_match *match,
-           const void *matchinfo, int offset, unsigned int protoff,
-           bool *hotdrop)
+length_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_length_info *info = matchinfo;
+	const struct xt_length_info *info = par->matchinfo;
 	const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
 				 sizeof(struct ipv6hdr);
 
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 00247bd1095..c475eac5dbe 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -58,13 +58,10 @@ static DEFINE_SPINLOCK(limit_lock);
 #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
 
 static bool
-limit_mt(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+limit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct xt_rateinfo *r =
-		((const struct xt_rateinfo *)matchinfo)->master;
+		((const struct xt_rateinfo *)par->matchinfo)->master;
 	unsigned long now = jiffies;
 
 	spin_lock_bh(&limit_lock);
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 60db240098a..269f9d8aef5 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -24,12 +24,9 @@ MODULE_DESCRIPTION("Xtables: MAC address match");
 MODULE_ALIAS("ipt_mac");
 MODULE_ALIAS("ip6t_mac");
 
-static bool
-mac_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool mac_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-    const struct xt_mac_info *info = matchinfo;
+    const struct xt_mac_info *info = par->matchinfo;
 
     /* Is mac pointer valid? */
     return skb_mac_header(skb) >= skb->head &&
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 96dd2b63b6b..88547614653 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -23,22 +23,17 @@ MODULE_ALIAS("ipt_mark");
 MODULE_ALIAS("ip6t_mark");
 
 static bool
-mark_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, const struct xt_match *match,
-           const void *matchinfo, int offset, unsigned int protoff,
-           bool *hotdrop)
+mark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_mark_info *info = matchinfo;
+	const struct xt_mark_info *info = par->matchinfo;
 
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
 static bool
-mark_mt(const struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, const struct xt_match *match,
-        const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_mark_mtinfo1 *info = matchinfo;
+	const struct xt_mark_mtinfo1 *info = par->matchinfo;
 
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index f6fe008ab8c..7087e291528 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -95,25 +95,22 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
 }
 
 static bool
-multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-                const struct net_device *out, const struct xt_match *match,
-                const void *matchinfo, int offset, unsigned int protoff,
-                bool *hotdrop)
+multiport_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const __be16 *pptr;
 	__be16 _ports[2];
-	const struct xt_multiport *multiinfo = matchinfo;
+	const struct xt_multiport *multiinfo = par->matchinfo;
 
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports);
+	pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports);
 	if (pptr == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
@@ -122,25 +119,22 @@ multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in,
 }
 
 static bool
-multiport_mt(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+multiport_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const __be16 *pptr;
 	__be16 _ports[2];
-	const struct xt_multiport_v1 *multiinfo = matchinfo;
+	const struct xt_multiport_v1 *multiinfo = par->matchinfo;
 
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports);
+	pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports);
 	if (pptr == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index d1c3b7ae9b4..493b5eb8d14 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -21,12 +21,9 @@
 #include <linux/netfilter_ipv6/ip6t_owner.h>
 
 static bool
-owner_mt_v0(const struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, const struct xt_match *match,
-            const void *matchinfo, int offset, unsigned int protoff,
-            bool *hotdrop)
+owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ipt_owner_info *info = matchinfo;
+	const struct ipt_owner_info *info = par->matchinfo;
 	const struct file *filp;
 
 	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
@@ -50,12 +47,9 @@ owner_mt_v0(const struct sk_buff *skb, const struct net_device *in,
 }
 
 static bool
-owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct ip6t_owner_info *info = matchinfo;
+	const struct ip6t_owner_info *info = par->matchinfo;
 	const struct file *filp;
 
 	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
@@ -79,12 +73,9 @@ owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in,
 }
 
 static bool
-owner_mt(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+owner_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_owner_match_info *info = matchinfo;
+	const struct xt_owner_match_info *info = par->matchinfo;
 	const struct file *filp;
 
 	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 72a0bdd53fa..e980e179d4f 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -21,14 +21,11 @@ MODULE_ALIAS("ipt_physdev");
 MODULE_ALIAS("ip6t_physdev");
 
 static bool
-physdev_mt(const struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, const struct xt_match *match,
-           const void *matchinfo, int offset, unsigned int protoff,
-           bool *hotdrop)
+physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	int i;
 	static const char nulldevname[IFNAMSIZ];
-	const struct xt_physdev_info *info = matchinfo;
+	const struct xt_physdev_info *info = par->matchinfo;
 	bool ret;
 	const char *indev, *outdev;
 	const struct nf_bridge_info *nf_bridge;
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 81e86d319a8..37753a37760 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -23,20 +23,17 @@ MODULE_ALIAS("ipt_pkttype");
 MODULE_ALIAS("ip6t_pkttype");
 
 static bool
-pkttype_mt(const struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, const struct xt_match *match,
-           const void *matchinfo, int offset, unsigned int protoff,
-           bool *hotdrop)
+pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_pkttype_info *info = matchinfo;
+	const struct xt_pkttype_info *info = par->matchinfo;
 	u_int8_t type;
 
 	if (skb->pkt_type != PACKET_LOOPBACK)
 		type = skb->pkt_type;
-	else if (match->family == NFPROTO_IPV4 &&
+	else if (par->match->family == NFPROTO_IPV4 &&
 	    ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (match->family == NFPROTO_IPV6 &&
+	else if (par->match->family == NFPROTO_IPV6 &&
 	    ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
 		type = PACKET_MULTICAST;
 	else
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index f1d514e9d0a..b0a00fb0511 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -110,18 +110,15 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
 }
 
 static bool
-policy_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+policy_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_policy_info *info = matchinfo;
+	const struct xt_policy_info *info = par->matchinfo;
 	int ret;
 
 	if (info->flags & XT_POLICY_MATCH_IN)
-		ret = match_policy_in(skb, info, match->family);
+		ret = match_policy_in(skb, info, par->match->family);
 	else
-		ret = match_policy_out(skb, info, match->family);
+		ret = match_policy_out(skb, info, par->match->family);
 
 	if (ret < 0)
 		ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index a3c8798f0cc..3ab92666c14 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -18,13 +18,10 @@ MODULE_ALIAS("ip6t_quota");
 static DEFINE_SPINLOCK(quota_lock);
 
 static bool
-quota_mt(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+quota_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	struct xt_quota_info *q =
-		((const struct xt_quota_info *)matchinfo)->master;
+		((const struct xt_quota_info *)par->matchinfo)->master;
 	bool ret = q->flags & XT_QUOTA_INVERT;
 
 	spin_lock_bh(&quota_lock);
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 4dcfd7353db..e9f64ef4565 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -14,16 +14,10 @@
 #include <net/netfilter/xt_rateest.h>
 
 
-static bool xt_rateest_mt(const struct sk_buff *skb,
-			  const struct net_device *in,
-			  const struct net_device *out,
-			  const struct xt_match *match,
-			  const void *matchinfo,
-			  int offset,
-			  unsigned int protoff,
-			  bool *hotdrop)
+static bool
+xt_rateest_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_rateest_match_info *info = matchinfo;
+	const struct xt_rateest_match_info *info = par->matchinfo;
 	struct gnet_stats_rate_est *r;
 	u_int32_t bps1, bps2, pps1, pps2;
 	bool ret = true;
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index ef65756d489..b25942110ed 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -22,12 +22,9 @@ MODULE_DESCRIPTION("Xtables: Routing realm match");
 MODULE_ALIAS("ipt_realm");
 
 static bool
-realm_mt(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+realm_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_realm_info *info = matchinfo;
+	const struct xt_realm_info *info = par->matchinfo;
 	const struct dst_entry *dst = skb->dst;
 
 	return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 4a916e2624d..baeb90a5623 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -204,19 +204,16 @@ static void recent_table_flush(struct recent_table *t)
 }
 
 static bool
-recent_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+recent_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_recent_mtinfo *info = matchinfo;
+	const struct xt_recent_mtinfo *info = par->matchinfo;
 	struct recent_table *t;
 	struct recent_entry *e;
 	union nf_inet_addr addr = {};
 	u_int8_t ttl;
 	bool ret = info->invert;
 
-	if (match->family == NFPROTO_IPV4) {
+	if (par->match->family == NFPROTO_IPV4) {
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (info->side == XT_RECENT_DEST)
@@ -237,19 +234,19 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in,
 	}
 
 	/* use TTL as seen before forwarding */
-	if (out && !skb->sk)
+	if (par->out != NULL && skb->sk == NULL)
 		ttl++;
 
 	spin_lock_bh(&recent_lock);
 	t = recent_table_lookup(info->name);
-	e = recent_entry_lookup(t, &addr, match->family,
+	e = recent_entry_lookup(t, &addr, par->match->family,
 				(info->check_set & XT_RECENT_TTL) ? ttl : 0);
 	if (e == NULL) {
 		if (!(info->check_set & XT_RECENT_SET))
 			goto out;
-		e = recent_entry_init(t, &addr, match->family, ttl);
+		e = recent_entry_init(t, &addr, par->match->family, ttl);
 		if (e == NULL)
-			*hotdrop = true;
+			*par->hotdrop = true;
 		ret = !ret;
 		goto out;
 	}
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index ab67aca4d8f..b0014ab65da 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -117,23 +117,21 @@ match_packet(const struct sk_buff *skb,
 }
 
 static bool
-sctp_mt(const struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, const struct xt_match *match,
-        const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+sctp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_sctp_info *info = matchinfo;
+	const struct xt_sctp_info *info = par->matchinfo;
 	const sctp_sctphdr_t *sh;
 	sctp_sctphdr_t _sh;
 
-	if (offset) {
+	if (par->fragoff != 0) {
 		duprintf("Dropping non-first fragment.. FIXME\n");
 		return false;
 	}
 
-	sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh);
+	sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh);
 	if (sh == NULL) {
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 	duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
@@ -144,8 +142,8 @@ sctp_mt(const struct sk_buff *skb, const struct net_device *in,
 		&& SCCHECK(ntohs(sh->dest) >= info->dpts[0]
 			&& ntohs(sh->dest) <= info->dpts[1],
 			XT_SCTP_DEST_PORTS, info->flags, info->invflags)
-		&& SCCHECK(match_packet(skb, protoff + sizeof (sctp_sctphdr_t),
-					info, hotdrop),
+		&& SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t),
+					info, par->hotdrop),
 			   XT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
 }
 
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index ac9db17c7b9..02a8fed2108 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -86,14 +86,7 @@ extract_icmp_fields(const struct sk_buff *skb,
 
 
 static bool
-socket_mt(const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const struct xt_match *match,
-	  const void *matchinfo,
-	  int offset,
-	  unsigned int protoff,
-	  bool *hotdrop)
+socket_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct udphdr _hdr, *hp = NULL;
@@ -146,7 +139,7 @@ socket_mt(const struct sk_buff *skb,
 #endif
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
-				   saddr, daddr, sport, dport, in, false);
+				   saddr, daddr, sport, dport, par->in, false);
 	if (sk != NULL) {
 		bool wildcard = (inet_sk(sk)->rcv_saddr == 0);
 
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index f92f8bcc1e3..29f5a8a1b02 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -21,12 +21,9 @@ MODULE_ALIAS("ipt_state");
 MODULE_ALIAS("ip6t_state");
 
 static bool
-state_mt(const struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, const struct xt_match *match,
-         const void *matchinfo, int offset, unsigned int protoff,
-         bool *hotdrop)
+state_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_state_info *sinfo = matchinfo;
+	const struct xt_state_info *sinfo = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
 	unsigned int statebit;
 
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index f41a92322e6..dcadc491db2 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -25,12 +25,9 @@ MODULE_ALIAS("ip6t_statistic");
 static DEFINE_SPINLOCK(nth_lock);
 
 static bool
-statistic_mt(const struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, const struct xt_match *match,
-             const void *matchinfo, int offset, unsigned int protoff,
-             bool *hotdrop)
+statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
+	struct xt_statistic_info *info = (void *)par->matchinfo;
 	bool ret = info->flags & XT_STATISTIC_INVERT;
 
 	switch (info->mode) {
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 18d8884e737..33f2d29ca4f 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -22,18 +22,15 @@ MODULE_ALIAS("ipt_string");
 MODULE_ALIAS("ip6t_string");
 
 static bool
-string_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+string_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_string_info *conf = matchinfo;
+	const struct xt_string_info *conf = par->matchinfo;
 	struct ts_state state;
 	int invert;
 
 	memset(&state, 0, sizeof(struct ts_state));
 
-	invert = (match->revision == 0 ? conf->u.v0.invert :
+	invert = (par->match->revision == 0 ? conf->u.v0.invert :
 				    conf->u.v1.flags & XT_STRING_FLAG_INVERT);
 
 	return (skb_find_text((struct sk_buff *)skb, conf->from_offset,
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 4791c7cbe5a..4809b34b10f 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -25,12 +25,9 @@ MODULE_ALIAS("ipt_tcpmss");
 MODULE_ALIAS("ip6t_tcpmss");
 
 static bool
-tcpmss_mt(const struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, const struct xt_match *match,
-          const void *matchinfo, int offset, unsigned int protoff,
-          bool *hotdrop)
+tcpmss_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_tcpmss_match_info *info = matchinfo;
+	const struct xt_tcpmss_match_info *info = par->matchinfo;
 	const struct tcphdr *th;
 	struct tcphdr _tcph;
 	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
@@ -39,7 +36,7 @@ tcpmss_mt(const struct sk_buff *skb, const struct net_device *in,
 	unsigned int i, optlen;
 
 	/* If we don't have the whole header, drop packet. */
-	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
 	if (th == NULL)
 		goto dropit;
 
@@ -52,7 +49,7 @@ tcpmss_mt(const struct sk_buff *skb, const struct net_device *in,
 		goto out;
 
 	/* Truncated options. */
-	op = skb_header_pointer(skb, protoff + sizeof(*th), optlen, _opt);
+	op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt);
 	if (op == NULL)
 		goto dropit;
 
@@ -76,7 +73,7 @@ out:
 	return info->invert;
 
 dropit:
-	*hotdrop = true;
+	*par->hotdrop = true;
 	return false;
 }
 
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 5a6268cbb9f..66cf71b1d59 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -68,25 +68,22 @@ tcp_find_option(u_int8_t option,
 	return invert;
 }
 
-static bool
-tcp_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool tcp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct tcphdr *th;
 	struct tcphdr _tcph;
-	const struct xt_tcp *tcpinfo = matchinfo;
+	const struct xt_tcp *tcpinfo = par->matchinfo;
 
-	if (offset) {
+	if (par->fragoff != 0) {
 		/* To quote Alan:
 
 		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
 		   causes this. Its a cracker trying to break in by doing a
 		   flag overwrite to pass the direction checks.
 		*/
-		if (offset == 1) {
+		if (par->fragoff == 1) {
 			duprintf("Dropping evil TCP offset=1 frag.\n");
-			*hotdrop = true;
+			*par->hotdrop = true;
 		}
 		/* Must not be a fragment. */
 		return false;
@@ -94,12 +91,12 @@ tcp_mt(const struct sk_buff *skb, const struct net_device *in,
 
 #define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg)))
 
-	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
@@ -117,13 +114,13 @@ tcp_mt(const struct sk_buff *skb, const struct net_device *in,
 		return false;
 	if (tcpinfo->option) {
 		if (th->doff * 4 < sizeof(_tcph)) {
-			*hotdrop = true;
+			*par->hotdrop = true;
 			return false;
 		}
-		if (!tcp_find_option(tcpinfo->option, skb, protoff,
+		if (!tcp_find_option(tcpinfo->option, skb, par->thoff,
 				     th->doff*4 - sizeof(_tcph),
 				     tcpinfo->invflags & XT_TCP_INV_OPTION,
-				     hotdrop))
+				     par->hotdrop))
 			return false;
 	}
 	return true;
@@ -141,25 +138,22 @@ tcp_mt_check(const char *tablename, const void *info,
 	return !(tcpinfo->invflags & ~XT_TCP_INV_MASK);
 }
 
-static bool
-udp_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool udp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct udphdr *uh;
 	struct udphdr _udph;
-	const struct xt_udp *udpinfo = matchinfo;
+	const struct xt_udp *udpinfo = par->matchinfo;
 
 	/* Must not be a fragment. */
-	if (offset)
+	if (par->fragoff != 0)
 		return false;
 
-	uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph);
+	uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph);
 	if (uh == NULL) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil UDP tinygram.\n");
-		*hotdrop = true;
+		*par->hotdrop = true;
 		return false;
 	}
 
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 32d4c769caa..28599d3979c 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -153,11 +153,9 @@ static void localtime_3(struct xtm *r, time_t time)
 }
 
 static bool
-time_mt(const struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, const struct xt_match *match,
-        const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+time_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_time_info *info = matchinfo;
+	const struct xt_time_info *info = par->matchinfo;
 	unsigned int packet_time;
 	struct xtm current_time;
 	s64 stamp;
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index a6b971dc5d3..24a52762450 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -87,12 +87,9 @@ static bool u32_match_it(const struct xt_u32 *data,
 	return true;
 }
 
-static bool
-u32_mt(const struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, const struct xt_match *match,
-       const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
+static bool u32_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_u32 *data = matchinfo;
+	const struct xt_u32 *data = par->matchinfo;
 	bool ret;
 
 	ret = u32_match_it(data, skb);
-- 
cgit v1.2.3


From 9b4fce7a3508a9776534188b6065b206a9608ccf Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:18 +0200
Subject: netfilter: xtables: move extension arguments into compound structure
 (2/6)

This patch does this for match extensions' checkentry functions.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_802_3.c     |  7 ++----
 net/bridge/netfilter/ebt_among.c     |  9 +++----
 net/bridge/netfilter/ebt_arp.c       |  9 +++----
 net/bridge/netfilter/ebt_ip.c        |  9 +++----
 net/bridge/netfilter/ebt_ip6.c       |  9 +++----
 net/bridge/netfilter/ebt_limit.c     |  7 ++----
 net/bridge/netfilter/ebt_mark_m.c    |  7 ++----
 net/bridge/netfilter/ebt_pkttype.c   |  7 ++----
 net/bridge/netfilter/ebt_stp.c       |  9 +++----
 net/bridge/netfilter/ebt_vlan.c      |  9 +++----
 net/bridge/netfilter/ebtables.c      | 19 +++++++++-----
 net/ipv4/netfilter/ip_tables.c       | 49 +++++++++++++++++-------------------
 net/ipv4/netfilter/ipt_addrtype.c    | 13 +++++-----
 net/ipv4/netfilter/ipt_ah.c          |  8 ++----
 net/ipv4/netfilter/ipt_ecn.c         |  9 +++----
 net/ipv6/netfilter/ip6_tables.c      | 48 +++++++++++++++++------------------
 net/ipv6/netfilter/ip6t_ah.c         |  8 ++----
 net/ipv6/netfilter/ip6t_frag.c       |  8 ++----
 net/ipv6/netfilter/ip6t_hbh.c        |  8 ++----
 net/ipv6/netfilter/ip6t_ipv6header.c |  7 ++----
 net/ipv6/netfilter/ip6t_mh.c         |  8 ++----
 net/ipv6/netfilter/ip6t_rt.c         |  8 ++----
 net/netfilter/x_tables.c             | 32 +++++++++++------------
 net/netfilter/xt_connbytes.c         | 14 ++++-------
 net/netfilter/xt_connlimit.c         | 13 ++++------
 net/netfilter/xt_connmark.c          | 20 ++++++---------
 net/netfilter/xt_conntrack.c         |  9 +++----
 net/netfilter/xt_dccp.c              |  7 ++----
 net/netfilter/xt_dscp.c              | 11 +++-----
 net/netfilter/xt_esp.c               |  8 ++----
 net/netfilter/xt_hashlimit.c         | 24 +++++++-----------
 net/netfilter/xt_helper.c            | 11 +++-----
 net/netfilter/xt_limit.c             |  7 ++----
 net/netfilter/xt_mark.c              |  7 ++----
 net/netfilter/xt_multiport.c         | 37 +++++++++------------------
 net/netfilter/xt_owner.c             | 14 +++--------
 net/netfilter/xt_physdev.c           | 13 ++++------
 net/netfilter/xt_policy.c            | 15 +++++------
 net/netfilter/xt_quota.c             |  7 ++----
 net/netfilter/xt_rateest.c           |  8 ++----
 net/netfilter/xt_recent.c            |  7 ++----
 net/netfilter/xt_sctp.c              |  7 ++----
 net/netfilter/xt_state.c             |  9 +++----
 net/netfilter/xt_statistic.c         |  7 ++----
 net/netfilter/xt_string.c            |  9 +++----
 net/netfilter/xt_tcpudp.c            | 16 +++---------
 net/netfilter/xt_time.c              |  7 ++----
 47 files changed, 218 insertions(+), 376 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index c9e1bc14951..bd91dc58d49 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -36,12 +36,9 @@ ebt_802_3_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_802_3_mt_check(const char *table, const void *entry,
-		   const struct xt_match *match, void *data,
-		   unsigned int hook_mask)
+static bool ebt_802_3_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_802_3_info *info = data;
+	const struct ebt_802_3_info *info = par->matchinfo;
 
 	if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK)
 		return false;
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 0ad0db3e815..b595f091f35 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -171,14 +171,11 @@ ebt_among_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_among_mt_check(const char *table, const void *entry,
-		   const struct xt_match *match, void *data,
-		   unsigned int hook_mask)
+static bool ebt_among_mt_check(const struct xt_mtchk_param *par)
 {
+	const struct ebt_among_info *info = par->matchinfo;
 	const struct ebt_entry_match *em =
-		container_of(data, const struct ebt_entry_match, data);
-	const struct ebt_among_info *info = data;
+		container_of(par->matchinfo, const struct ebt_entry_match, data);
 	int expected_length = sizeof(struct ebt_among_info);
 	const struct ebt_mac_wormhash *wh_dst, *wh_src;
 	int err;
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 1ff8fa3a9e7..b7ad60419f9 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -100,13 +100,10 @@ ebt_arp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_arp_mt_check(const char *table, const void *entry,
-		 const struct xt_match *match, void *data,
-		 unsigned int hook_mask)
+static bool ebt_arp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_arp_info *info = data;
-	const struct ebt_entry *e = entry;
+	const struct ebt_arp_info *info = par->matchinfo;
+	const struct ebt_entry *e = par->entryinfo;
 
 	if ((e->ethproto != htons(ETH_P_ARP) &&
 	   e->ethproto != htons(ETH_P_RARP)) ||
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index c70ea39840b..d771bbfbcbe 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -77,13 +77,10 @@ ebt_ip_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_ip_mt_check(const char *table, const void *entry,
-		const struct xt_match *match, void *data,
-		unsigned int hook_mask)
+static bool ebt_ip_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_ip_info *info = data;
-	const struct ebt_entry *e = entry;
+	const struct ebt_ip_info *info = par->matchinfo;
+	const struct ebt_entry *e = par->entryinfo;
 
 	if (e->ethproto != htons(ETH_P_IP) ||
 	   e->invflags & EBT_IPROTO)
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 5acee02de72..784a6573876 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -90,13 +90,10 @@ ebt_ip6_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_ip6_mt_check(const char *table, const void *entry,
-		 const struct xt_match *match, void *data,
-		 unsigned int hook_mask)
+static bool ebt_ip6_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_entry *e = entry;
-	struct ebt_ip6_info *info = data;
+	const struct ebt_entry *e = par->entryinfo;
+	struct ebt_ip6_info *info = par->matchinfo;
 
 	if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
 		return false;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 9a3ec8cadaa..f7bd9192ff0 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -64,12 +64,9 @@ user2credits(u_int32_t user)
 	return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE;
 }
 
-static bool
-ebt_limit_mt_check(const char *table, const void *e,
-		   const struct xt_match *match, void *data,
-		   unsigned int hook_mask)
+static bool ebt_limit_mt_check(const struct xt_mtchk_param *par)
 {
-	struct ebt_limit_info *info = data;
+	struct ebt_limit_info *info = par->matchinfo;
 
 	/* Check for overflow. */
 	if (info->burst == 0 ||
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index 5b22ef96127..ea570f214b1 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -22,12 +22,9 @@ ebt_mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static bool
-ebt_mark_mt_check(const char *table, const void *e,
-		  const struct xt_match *match, void *data,
-		  unsigned int hook_mask)
+static bool ebt_mark_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_mark_m_info *info = data;
+	const struct ebt_mark_m_info *info = par->matchinfo;
 
 	if (info->bitmask & ~EBT_MARK_MASK)
 		return false;
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index b756f88fb10..883e96e2a54 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -20,12 +20,9 @@ ebt_pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return (skb->pkt_type == info->pkt_type) ^ info->invert;
 }
 
-static bool
-ebt_pkttype_mt_check(const char *table, const void *e,
-		     const struct xt_match *match, void *data,
-		     unsigned int hook_mask)
+static bool ebt_pkttype_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_pkttype_info *info = data;
+	const struct ebt_pkttype_info *info = par->matchinfo;
 
 	if (info->invert != 0 && info->invert != 1)
 		return false;
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 06d777c62c3..48527e62162 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -153,15 +153,12 @@ ebt_stp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_stp_mt_check(const char *table, const void *entry,
-		 const struct xt_match *match, void *data,
-		 unsigned int hook_mask)
+static bool ebt_stp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ebt_stp_info *info = data;
+	const struct ebt_stp_info *info = par->matchinfo;
 	const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
 	const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-	const struct ebt_entry *e = entry;
+	const struct ebt_entry *e = par->entryinfo;
 
 	if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
 	    !(info->bitmask & EBT_STP_MASK))
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index b05b4a81834..3dddd489328 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -84,13 +84,10 @@ ebt_vlan_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ebt_vlan_mt_check(const char *table, const void *entry,
-		  const struct xt_match *match, void *data,
-		  unsigned int hook_mask)
+static bool ebt_vlan_mt_check(const struct xt_mtchk_param *par)
 {
-	struct ebt_vlan_info *info = data;
-	const struct ebt_entry *e = entry;
+	struct ebt_vlan_info *info = par->matchinfo;
+	const struct ebt_entry *e = par->entryinfo;
 
 	/* Is it 802.1Q frame checked? */
 	if (e->ethproto != htons(ETH_P_8021Q)) {
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f8e1822f38d..5ce37b2f5b8 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -324,9 +324,10 @@ find_table_lock(const char *name, int *error, struct mutex *mutex)
 }
 
 static inline int
-ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
-   const char *name, unsigned int hookmask, unsigned int *cnt)
+ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
+		unsigned int *cnt)
 {
+	const struct ebt_entry *e = par->entryinfo;
 	struct xt_match *match;
 	size_t left = ((char *)e + e->watchers_offset) - (char *)m;
 	int ret;
@@ -343,9 +344,10 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		return -ENOENT;
 	m->u.match = match;
 
-	ret = xt_check_match(match, NFPROTO_BRIDGE, m->match_size,
-	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO,
-	      e, m->data);
+	par->match     = match;
+	par->matchinfo = m->data;
+	ret = xt_check_match(par, NFPROTO_BRIDGE, m->match_size,
+	      e->ethproto, e->invflags & EBT_IPROTO);
 	if (ret < 0) {
 		module_put(match->me);
 		return ret;
@@ -607,6 +609,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	unsigned int i, j, hook = 0, hookmask = 0;
 	size_t gap;
 	int ret;
+	struct xt_mtchk_param par;
 
 	/* don't mess with the struct ebt_entries */
 	if (e->bitmask == 0)
@@ -647,7 +650,11 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 			hookmask = cl_s[i - 1].hookmask;
 	}
 	i = 0;
-	ret = EBT_MATCH_ITERATE(e, ebt_check_match, e, name, hookmask, &i);
+
+	par.table     = name;
+	par.entryinfo = e;
+	par.hook_mask = hookmask;
+	ret = EBT_MATCH_ITERATE(e, ebt_check_match, &par, &i);
 	if (ret != 0)
 		goto cleanup_matches;
 	j = 0;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 99fdb59454f..4147298a6a8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -607,20 +607,20 @@ check_entry(struct ipt_entry *e, const char *name)
 }
 
 static int
-check_match(struct ipt_entry_match *m, const char *name,
-			      const struct ipt_ip *ip,
-			      unsigned int hookmask, unsigned int *i)
+check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
+	    unsigned int *i)
 {
-	struct xt_match *match;
+	const struct ipt_ip *ip = par->entryinfo;
 	int ret;
 
-	match = m->u.kernel.match;
-	ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m),
-			     name, hookmask, ip->proto,
-			     ip->invflags & IPT_INV_PROTO, ip, m->data);
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+
+	ret = xt_check_match(par, NFPROTO_IPV4, m->u.match_size - sizeof(*m),
+	      ip->proto, ip->invflags & IPT_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
-			 m->u.kernel.match->name);
+			 par.match->name);
 		return ret;
 	}
 	++*i;
@@ -628,10 +628,7 @@ check_match(struct ipt_entry_match *m, const char *name,
 }
 
 static int
-find_check_match(struct ipt_entry_match *m,
-		 const char *name,
-		 const struct ipt_ip *ip,
-		 unsigned int hookmask,
+find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
 		 unsigned int *i)
 {
 	struct xt_match *match;
@@ -646,7 +643,7 @@ find_check_match(struct ipt_entry_match *m,
 	}
 	m->u.kernel.match = match;
 
-	ret = check_match(m, name, ip, hookmask, i);
+	ret = check_match(m, par, i);
 	if (ret)
 		goto err;
 
@@ -683,14 +680,17 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
+	struct xt_mtchk_param mtpar;
 
 	ret = check_entry(e, name);
 	if (ret)
 		return ret;
 
 	j = 0;
-	ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip,
-				e->comefrom, &j);
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ip;
+	mtpar.hook_mask = e->comefrom;
+	ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
 	if (ret != 0)
 		goto cleanup_matches;
 
@@ -1644,12 +1644,15 @@ static int
 compat_check_entry(struct ipt_entry *e, const char *name,
 				     unsigned int *i)
 {
+	struct xt_mtchk_param mtpar;
 	unsigned int j;
 	int ret;
 
 	j = 0;
-	ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip,
-				e->comefrom, &j);
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ip;
+	mtpar.hook_mask = e->comefrom;
+	ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j);
 	if (ret)
 		goto cleanup_matches;
 
@@ -2144,15 +2147,9 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
 				    !!(icmpinfo->invflags&IPT_ICMP_INV));
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-icmp_checkentry(const char *tablename,
-	   const void *entry,
-	   const struct xt_match *match,
-	   void *matchinfo,
-	   unsigned int hook_mask)
+static bool icmp_checkentry(const struct xt_mtchk_param *par)
 {
-	const struct ipt_icmp *icmpinfo = matchinfo;
+	const struct ipt_icmp *icmpinfo = par->matchinfo;
 
 	/* Must specify no unknown invflags */
 	return !(icmpinfo->invflags & ~IPT_ICMP_INV);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index e60995e4c20..88762f02779 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -68,12 +68,9 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ret;
 }
 
-static bool
-addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
-			  const struct xt_match *match, void *matchinfo,
-			  unsigned int hook_mask)
+static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
 {
-	struct ipt_addrtype_info_v1 *info = matchinfo;
+	struct ipt_addrtype_info_v1 *info = par->matchinfo;
 
 	if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
 	    info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
@@ -82,14 +79,16 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
 		return false;
 	}
 
-	if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) &&
+	if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
+	    (1 << NF_INET_LOCAL_IN)) &&
 	    info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
 		printk(KERN_ERR "ipt_addrtype: output interface limitation "
 				"not valid in PRE_ROUTING and INPUT\n");
 		return false;
 	}
 
-	if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) &&
+	if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+	    (1 << NF_INET_LOCAL_OUT)) &&
 	    info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
 		printk(KERN_ERR "ipt_addrtype: input interface limitation "
 				"not valid in POST_ROUTING and OUTPUT\n");
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 2fce19ef4f3..0104c0b399d 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -61,13 +61,9 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			 !!(ahinfo->invflags & IPT_AH_INV_SPI));
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-ah_mt_check(const char *tablename, const void *ip_void,
-            const struct xt_match *match, void *matchinfo,
-            unsigned int hook_mask)
+static bool ah_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ipt_ah *ahinfo = matchinfo;
+	const struct ipt_ah *ahinfo = par->matchinfo;
 
 	/* Must specify no unknown invflags */
 	if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 06915463150..6289b64144c 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -85,13 +85,10 @@ static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-ecn_mt_check(const char *tablename, const void *ip_void,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool ecn_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ipt_ecn_info *info = matchinfo;
-	const struct ipt_ip *ip = ip_void;
+	const struct ipt_ecn_info *info = par->matchinfo;
+	const struct ipt_ip *ip = par->entryinfo;
 
 	if (info->operation & IPT_ECN_OP_MATCH_MASK)
 		return false;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index cf2c5370a4e..9c843e3777b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -629,20 +629,20 @@ check_entry(struct ip6t_entry *e, const char *name)
 	return 0;
 }
 
-static int check_match(struct ip6t_entry_match *m, const char *name,
-			      const struct ip6t_ip6 *ipv6,
-			      unsigned int hookmask, unsigned int *i)
+static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par,
+		       unsigned int *i)
 {
-	struct xt_match *match;
+	const struct ip6t_ip6 *ipv6 = par->entryinfo;
 	int ret;
 
-	match = m->u.kernel.match;
-	ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m),
-			     name, hookmask, ipv6->proto,
-			     ipv6->invflags & IP6T_INV_PROTO, ipv6, m->data);
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+
+	ret = xt_check_match(par, NFPROTO_IPV6, m->u.match_size - sizeof(*m),
+			     ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
-			 m->u.kernel.match->name);
+			 par.match->name);
 		return ret;
 	}
 	++*i;
@@ -650,10 +650,7 @@ static int check_match(struct ip6t_entry_match *m, const char *name,
 }
 
 static int
-find_check_match(struct ip6t_entry_match *m,
-		 const char *name,
-		 const struct ip6t_ip6 *ipv6,
-		 unsigned int hookmask,
+find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par,
 		 unsigned int *i)
 {
 	struct xt_match *match;
@@ -668,7 +665,7 @@ find_check_match(struct ip6t_entry_match *m,
 	}
 	m->u.kernel.match = match;
 
-	ret = check_match(m, name, ipv6, hookmask, i);
+	ret = check_match(m, par, i);
 	if (ret)
 		goto err;
 
@@ -705,14 +702,17 @@ find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
+	struct xt_mtchk_param mtpar;
 
 	ret = check_entry(e, name);
 	if (ret)
 		return ret;
 
 	j = 0;
-	ret = IP6T_MATCH_ITERATE(e, find_check_match, name, &e->ipv6,
-				 e->comefrom, &j);
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ipv6;
+	mtpar.hook_mask = e->comefrom;
+	ret = IP6T_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
 	if (ret != 0)
 		goto cleanup_matches;
 
@@ -1669,10 +1669,13 @@ static int compat_check_entry(struct ip6t_entry *e, const char *name,
 {
 	unsigned int j;
 	int ret;
+	struct xt_mtchk_param mtpar;
 
 	j = 0;
-	ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6,
-				 e->comefrom, &j);
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ipv6;
+	mtpar.hook_mask = e->comefrom;
+	ret = IP6T_MATCH_ITERATE(e, check_match, &mtpar, &j);
 	if (ret)
 		goto cleanup_matches;
 
@@ -2166,14 +2169,9 @@ icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par)
 }
 
 /* Called when user tries to insert an entry of this type. */
-static bool
-icmp6_checkentry(const char *tablename,
-	   const void *entry,
-	   const struct xt_match *match,
-	   void *matchinfo,
-	   unsigned int hook_mask)
+static bool icmp6_checkentry(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_icmp *icmpinfo = matchinfo;
+	const struct ip6t_icmp *icmpinfo = par->matchinfo;
 
 	/* Must specify no unknown invflags */
 	return !(icmpinfo->invflags & ~IP6T_ICMP_INV);
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index a04f2b8396e..3a82f24746b 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -90,13 +90,9 @@ static bool ah_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 	       !(ahinfo->hdrres && ah->reserved);
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-ah_mt6_check(const char *tablename, const void *entry,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool ah_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_ah *ahinfo = matchinfo;
+	const struct ip6t_ah *ahinfo = par->matchinfo;
 
 	if (ahinfo->invflags & ~IP6T_AH_INV_MASK) {
 		pr_debug("ip6t_ah: unknown flags %X\n", ahinfo->invflags);
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 6951d0dacf4..673aa0a5084 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -107,13 +107,9 @@ frag_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 		 && (ntohs(fh->frag_off) & IP6_MF));
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-frag_mt6_check(const char *tablename, const void *ip,
-               const struct xt_match *match, void *matchinfo,
-               unsigned int hook_mask)
+static bool frag_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_frag *fraginfo = matchinfo;
+	const struct ip6t_frag *fraginfo = par->matchinfo;
 
 	if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) {
 		pr_debug("ip6t_frag: unknown flags %X\n", fraginfo->invflags);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index d3351978819..cbe8dec9744 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -160,13 +160,9 @@ hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 	return false;
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-hbh_mt6_check(const char *tablename, const void *entry,
-              const struct xt_match *match, void *matchinfo,
-              unsigned int hook_mask)
+static bool hbh_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_opts *optsinfo = matchinfo;
+	const struct ip6t_opts *optsinfo = par->matchinfo;
 
 	if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) {
 		pr_debug("ip6t_opts: unknown flags %X\n", optsinfo->invflags);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 6aaca511d47..14e6724d567 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -118,12 +118,9 @@ ipv6header_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 	}
 }
 
-static bool
-ipv6header_mt6_check(const char *tablename, const void *ip,
-                     const struct xt_match *match, void *matchinfo,
-                     unsigned int hook_mask)
+static bool ipv6header_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_ipv6header_info *info = matchinfo;
+	const struct ip6t_ipv6header_info *info = par->matchinfo;
 
 	/* invflags is 0 or 0xff in hard mode */
 	if ((!info->modeflag) && info->invflags != 0x00 &&
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index 2803258b6d0..aafe4e66577 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -67,13 +67,9 @@ static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 			  !!(mhinfo->invflags & IP6T_MH_INV_TYPE));
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-mh_mt6_check(const char *tablename, const void *entry,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool mh_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_mh *mhinfo = matchinfo;
+	const struct ip6t_mh *mhinfo = par->matchinfo;
 
 	/* Must specify no unknown invflags */
 	return !(mhinfo->invflags & ~IP6T_MH_INV_MASK);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 9cf4b8a37af..356b8d6f6ba 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -186,13 +186,9 @@ static bool rt_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 	return false;
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-rt_mt6_check(const char *tablename, const void *entry,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool rt_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_rt *rtinfo = matchinfo;
+	const struct ip6t_rt *rtinfo = par->matchinfo;
 
 	if (rtinfo->invflags & ~IP6T_RT_INV_MASK) {
 		pr_debug("ip6t_rt: unknown flags %X\n", rtinfo->invflags);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d1f2fb3e8f2..817ab14f7cd 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -321,39 +321,39 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 }
 EXPORT_SYMBOL_GPL(xt_find_revision);
 
-int xt_check_match(const struct xt_match *match, unsigned short family,
-		   unsigned int size, const char *table, unsigned int hook_mask,
-		   unsigned short proto, int inv_proto, const void *entry,
-		   void *matchinfo)
+int xt_check_match(struct xt_mtchk_param *par, u_int8_t family,
+		   unsigned int size, u_int8_t proto, bool inv_proto)
 {
-	if (XT_ALIGN(match->matchsize) != size &&
-	    match->matchsize != -1) {
+	if (XT_ALIGN(par->match->matchsize) != size &&
+	    par->match->matchsize != -1) {
 		/*
 		 * ebt_among is exempt from centralized matchsize checking
 		 * because it uses a dynamic-size data set.
 		 */
 		printk("%s_tables: %s match: invalid size %Zu != %u\n",
-		       xt_prefix[family], match->name,
-		       XT_ALIGN(match->matchsize), size);
+		       xt_prefix[family], par->match->name,
+		       XT_ALIGN(par->match->matchsize), size);
 		return -EINVAL;
 	}
-	if (match->table && strcmp(match->table, table)) {
+	if (par->match->table != NULL &&
+	    strcmp(par->match->table, par->table) != 0) {
 		printk("%s_tables: %s match: only valid in %s table, not %s\n",
-		       xt_prefix[family], match->name, match->table, table);
+		       xt_prefix[family], par->match->name,
+		       par->match->table, par->table);
 		return -EINVAL;
 	}
-	if (match->hooks && (hook_mask & ~match->hooks) != 0) {
+	if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
 		printk("%s_tables: %s match: bad hook_mask %#x/%#x\n",
-		       xt_prefix[family], match->name, hook_mask, match->hooks);
+		       xt_prefix[family], par->match->name,
+		       par->hook_mask, par->match->hooks);
 		return -EINVAL;
 	}
-	if (match->proto && (match->proto != proto || inv_proto)) {
+	if (par->match->proto && (par->match->proto != proto || inv_proto)) {
 		printk("%s_tables: %s match: only valid for protocol %u\n",
-		       xt_prefix[family], match->name, match->proto);
+		       xt_prefix[family], par->match->name, par->match->proto);
 		return -EINVAL;
 	}
-	if (match->checkentry != NULL &&
-	    !match->checkentry(table, entry, match, matchinfo, hook_mask))
+	if (par->match->checkentry != NULL && !par->match->checkentry(par))
 		return -EINVAL;
 	return 0;
 }
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 30c19b5fe90..43a36c728e5 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -92,12 +92,9 @@ connbytes_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 		return what >= sinfo->count.from;
 }
 
-static bool
-connbytes_mt_check(const char *tablename, const void *ip,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool connbytes_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_connbytes_info *sinfo = matchinfo;
+	const struct xt_connbytes_info *sinfo = par->matchinfo;
 
 	if (sinfo->what != XT_CONNBYTES_PKTS &&
 	    sinfo->what != XT_CONNBYTES_BYTES &&
@@ -109,17 +106,16 @@ connbytes_mt_check(const char *tablename, const void *ip,
 	    sinfo->direction != XT_CONNBYTES_DIR_BOTH)
 		return false;
 
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", match->family);
+				    "proto=%u\n", par->match->family);
 		return false;
 	}
 
 	return true;
 }
 
-static void
-connbytes_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void connbytes_mt_destroy(const struct xt_match *match, void *matchinfo)
 {
 	nf_ct_l3proto_module_put(match->family);
 }
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 8b8f70e7664..1361e9919cf 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -221,24 +221,21 @@ connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return false;
 }
 
-static bool
-connlimit_mt_check(const char *tablename, const void *ip,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool connlimit_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_connlimit_info *info = matchinfo;
+	struct xt_connlimit_info *info = par->matchinfo;
 	unsigned int i;
 
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "cannot load conntrack support for "
-		       "address family %u\n", match->family);
+		       "address family %u\n", par->match->family);
 		return false;
 	}
 
 	/* init private data */
 	info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
 	if (info->data == NULL) {
-		nf_ct_l3proto_module_put(match->family);
+		nf_ct_l3proto_module_put(par->match->family);
 		return false;
 	}
 
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index df4f4a865a5..b935b7888a9 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -61,33 +61,27 @@ connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static bool
-connmark_mt_check_v0(const char *tablename, const void *ip,
-                     const struct xt_match *match, void *matchinfo,
-                     unsigned int hook_mask)
+static bool connmark_mt_check_v0(const struct xt_mtchk_param *par)
 {
-	const struct xt_connmark_info *cm = matchinfo;
+	const struct xt_connmark_info *cm = par->matchinfo;
 
 	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
 		printk(KERN_WARNING "connmark: only support 32bit mark\n");
 		return false;
 	}
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", match->family);
+				    "proto=%u\n", par->match->family);
 		return false;
 	}
 	return true;
 }
 
-static bool
-connmark_mt_check(const char *tablename, const void *ip,
-                  const struct xt_match *match, void *matchinfo,
-                  unsigned int hook_mask)
+static bool connmark_mt_check(const struct xt_mtchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "cannot load conntrack support for "
-		       "proto=%u\n", match->family);
+		       "proto=%u\n", par->match->family);
 		return false;
 	}
 	return true;
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 13a7e4eacdf..f04c46a02ce 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -278,14 +278,11 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-conntrack_mt_check(const char *tablename, const void *ip,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", match->family);
+				    "proto=%u\n", par->match->family);
 		return false;
 	}
 	return true;
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index 7aa30bb9105..e5d3e867328 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -121,12 +121,9 @@ dccp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			   XT_DCCP_OPTION, info->flags, info->invflags);
 }
 
-static bool
-dccp_mt_check(const char *tablename, const void *inf,
-              const struct xt_match *match, void *matchinfo,
-              unsigned int hook_mask)
+static bool dccp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_dccp_info *info = matchinfo;
+	const struct xt_dccp_info *info = par->matchinfo;
 
 	return !(info->flags & ~XT_DCCP_VALID_FLAGS)
 		&& !(info->invflags & ~XT_DCCP_VALID_FLAGS)
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 57d61206135..c3f8085460d 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -43,15 +43,12 @@ dscp_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 	return (dscp == info->dscp) ^ !!info->invert;
 }
 
-static bool
-dscp_mt_check(const char *tablename, const void *info,
-              const struct xt_match *match, void *matchinfo,
-              unsigned int hook_mask)
+static bool dscp_mt_check(const struct xt_mtchk_param *par)
 {
-	const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp;
+	const struct xt_dscp_info *info = par->matchinfo;
 
-	if (dscp > XT_DSCP_MAX) {
-		printk(KERN_ERR "xt_dscp: dscp %x out of range\n", dscp);
+	if (info->dscp > XT_DSCP_MAX) {
+		printk(KERN_ERR "xt_dscp: dscp %x out of range\n", info->dscp);
 		return false;
 	}
 
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 6d59f2e7c1c..609439967c2 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -66,13 +66,9 @@ static bool esp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			 !!(espinfo->invflags & XT_ESP_INV_SPI));
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-esp_mt_check(const char *tablename, const void *ip_void,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool esp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_esp *espinfo = matchinfo;
+	const struct xt_esp *espinfo = par->matchinfo;
 
 	if (espinfo->invflags & ~XT_ESP_INV_MASK) {
 		duprintf("xt_esp: unknown flags %X\n", espinfo->invflags);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 22a60a728cf..2f73820e46d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -664,12 +664,9 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return false;
 }
 
-static bool
-hashlimit_mt_check_v0(const char *tablename, const void *inf,
-                      const struct xt_match *match, void *matchinfo,
-                      unsigned int hook_mask)
+static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par)
 {
-	struct xt_hashlimit_info *r = matchinfo;
+	struct xt_hashlimit_info *r = par->matchinfo;
 
 	/* Check for overflow. */
 	if (r->cfg.burst == 0 ||
@@ -698,8 +695,8 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf,
 	 * the list of htable's in htable_create(), since then we would
 	 * create duplicate proc files. -HW */
 	mutex_lock(&hlimit_mutex);
-	r->hinfo = htable_find_get(r->name, match->family);
-	if (!r->hinfo && htable_create_v0(r, match->family) != 0) {
+	r->hinfo = htable_find_get(r->name, par->match->family);
+	if (!r->hinfo && htable_create_v0(r, par->match->family) != 0) {
 		mutex_unlock(&hlimit_mutex);
 		return false;
 	}
@@ -710,12 +707,9 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf,
 	return true;
 }
 
-static bool
-hashlimit_mt_check(const char *tablename, const void *inf,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool hashlimit_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_hashlimit_mtinfo1 *info = matchinfo;
+	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 
 	/* Check for overflow. */
 	if (info->cfg.burst == 0 ||
@@ -729,7 +723,7 @@ hashlimit_mt_check(const char *tablename, const void *inf,
 		return false;
 	if (info->name[sizeof(info->name)-1] != '\0')
 		return false;
-	if (match->family == NFPROTO_IPV4) {
+	if (par->match->family == NFPROTO_IPV4) {
 		if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
 			return false;
 	} else {
@@ -744,8 +738,8 @@ hashlimit_mt_check(const char *tablename, const void *inf,
 	 * the list of htable's in htable_create(), since then we would
 	 * create duplicate proc files. -HW */
 	mutex_lock(&hlimit_mutex);
-	info->hinfo = htable_find_get(info->name, match->family);
-	if (!info->hinfo && htable_create(info, match->family) != 0) {
+	info->hinfo = htable_find_get(info->name, par->match->family);
+	if (!info->hinfo && htable_create(info, par->match->family) != 0) {
 		mutex_unlock(&hlimit_mutex);
 		return false;
 	}
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 73bdc3ba13f..86d3c332fcb 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -54,16 +54,13 @@ helper_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ret;
 }
 
-static bool
-helper_mt_check(const char *tablename, const void *inf,
-                const struct xt_match *match, void *matchinfo,
-                unsigned int hook_mask)
+static bool helper_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_helper_info *info = matchinfo;
+	struct xt_helper_info *info = par->matchinfo;
 
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", match->family);
+				    "proto=%u\n", par->match->family);
 		return false;
 	}
 	info->name[29] = '\0';
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index c475eac5dbe..c908d69a559 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -92,12 +92,9 @@ user2credits(u_int32_t user)
 	return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE;
 }
 
-static bool
-limit_mt_check(const char *tablename, const void *inf,
-               const struct xt_match *match, void *matchinfo,
-               unsigned int hook_mask)
+static bool limit_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_rateinfo *r = matchinfo;
+	struct xt_rateinfo *r = par->matchinfo;
 
 	/* Check for overflow. */
 	if (r->burst == 0
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 88547614653..10b9e34bbc5 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -38,12 +38,9 @@ mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static bool
-mark_mt_check_v0(const char *tablename, const void *entry,
-                 const struct xt_match *match, void *matchinfo,
-                 unsigned int hook_mask)
+static bool mark_mt_check_v0(const struct xt_mtchk_param *par)
 {
-	const struct xt_mark_info *minfo = matchinfo;
+	const struct xt_mark_info *minfo = par->matchinfo;
 
 	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
 		printk(KERN_WARNING "mark: only supports 32bit mark\n");
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index 7087e291528..d06bb2dd390 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -158,50 +158,37 @@ check(u_int16_t proto,
 		&& count <= XT_MULTI_PORTS;
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-multiport_mt_check_v0(const char *tablename, const void *info,
-                      const struct xt_match *match, void *matchinfo,
-                      unsigned int hook_mask)
+static bool multiport_mt_check_v0(const struct xt_mtchk_param *par)
 {
-	const struct ipt_ip *ip = info;
-	const struct xt_multiport *multiinfo = matchinfo;
+	const struct ipt_ip *ip = par->entryinfo;
+	const struct xt_multiport *multiinfo = par->matchinfo;
 
 	return check(ip->proto, ip->invflags, multiinfo->flags,
 		     multiinfo->count);
 }
 
-static bool
-multiport_mt_check(const char *tablename, const void *info,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool multiport_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct ipt_ip *ip = info;
-	const struct xt_multiport_v1 *multiinfo = matchinfo;
+	const struct ipt_ip *ip = par->entryinfo;
+	const struct xt_multiport_v1 *multiinfo = par->matchinfo;
 
 	return check(ip->proto, ip->invflags, multiinfo->flags,
 		     multiinfo->count);
 }
 
-static bool
-multiport_mt6_check_v0(const char *tablename, const void *info,
-                       const struct xt_match *match, void *matchinfo,
-                       unsigned int hook_mask)
+static bool multiport_mt6_check_v0(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_ip6 *ip = info;
-	const struct xt_multiport *multiinfo = matchinfo;
+	const struct ip6t_ip6 *ip = par->entryinfo;
+	const struct xt_multiport *multiinfo = par->matchinfo;
 
 	return check(ip->proto, ip->invflags, multiinfo->flags,
 		     multiinfo->count);
 }
 
-static bool
-multiport_mt6_check(const char *tablename, const void *info,
-                    const struct xt_match *match, void *matchinfo,
-                    unsigned int hook_mask)
+static bool multiport_mt6_check(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_ip6 *ip = info;
-	const struct xt_multiport_v1 *multiinfo = matchinfo;
+	const struct ip6t_ip6 *ip = par->entryinfo;
+	const struct xt_multiport_v1 *multiinfo = par->matchinfo;
 
 	return check(ip->proto, ip->invflags, multiinfo->flags,
 		     multiinfo->count);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 493b5eb8d14..32f84e84d9e 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -107,12 +107,9 @@ owner_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-owner_mt_check_v0(const char *tablename, const void *ip,
-                  const struct xt_match *match, void *matchinfo,
-                  unsigned int hook_mask)
+static bool owner_mt_check_v0(const struct xt_mtchk_param *par)
 {
-	const struct ipt_owner_info *info = matchinfo;
+	const struct ipt_owner_info *info = par->matchinfo;
 
 	if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) {
 		printk(KERN_WARNING KBUILD_MODNAME
@@ -124,12 +121,9 @@ owner_mt_check_v0(const char *tablename, const void *ip,
 	return true;
 }
 
-static bool
-owner_mt6_check_v0(const char *tablename, const void *ip,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool owner_mt6_check_v0(const struct xt_mtchk_param *par)
 {
-	const struct ip6t_owner_info *info = matchinfo;
+	const struct ip6t_owner_info *info = par->matchinfo;
 
 	if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) {
 		printk(KERN_WARNING KBUILD_MODNAME
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index e980e179d4f..b01786d2dd9 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -91,12 +91,9 @@ match_outdev:
 	return ret ^ !(info->invert & XT_PHYSDEV_OP_OUT);
 }
 
-static bool
-physdev_mt_check(const char *tablename, const void *ip,
-                 const struct xt_match *match, void *matchinfo,
-                 unsigned int hook_mask)
+static bool physdev_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_physdev_info *info = matchinfo;
+	const struct xt_physdev_info *info = par->matchinfo;
 
 	if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
 	    info->bitmask & ~XT_PHYSDEV_OP_MASK)
@@ -104,12 +101,12 @@ physdev_mt_check(const char *tablename, const void *ip,
 	if (info->bitmask & XT_PHYSDEV_OP_OUT &&
 	    (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
 	     info->invert & XT_PHYSDEV_OP_BRIDGED) &&
-	    hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
-			 (1 << NF_INET_POST_ROUTING))) {
+	    par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
+	    (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
 		printk(KERN_WARNING "physdev match: using --physdev-out in the "
 		       "OUTPUT, FORWARD and POSTROUTING chains for non-bridged "
 		       "traffic is not supported anymore.\n");
-		if (hook_mask & (1 << NF_INET_LOCAL_OUT))
+		if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
 			return false;
 	}
 	return true;
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index b0a00fb0511..328bd20ddd2 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -128,26 +128,23 @@ policy_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ret;
 }
 
-static bool
-policy_mt_check(const char *tablename, const void *ip_void,
-                const struct xt_match *match, void *matchinfo,
-                unsigned int hook_mask)
+static bool policy_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_policy_info *info = matchinfo;
+	const struct xt_policy_info *info = par->matchinfo;
 
 	if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) {
 		printk(KERN_ERR "xt_policy: neither incoming nor "
 				"outgoing policy selected\n");
 		return false;
 	}
-	if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN)
-	    && info->flags & XT_POLICY_MATCH_OUT) {
+	if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
+	    (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) {
 		printk(KERN_ERR "xt_policy: output policy not valid in "
 				"PRE_ROUTING and INPUT\n");
 		return false;
 	}
-	if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT)
-	    && info->flags & XT_POLICY_MATCH_IN) {
+	if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+	    (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) {
 		printk(KERN_ERR "xt_policy: input policy not valid in "
 				"POST_ROUTING and OUTPUT\n");
 		return false;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 3ab92666c14..c84fce5e0f3 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -37,12 +37,9 @@ quota_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ret;
 }
 
-static bool
-quota_mt_check(const char *tablename, const void *entry,
-               const struct xt_match *match, void *matchinfo,
-               unsigned int hook_mask)
+static bool quota_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_quota_info *q = matchinfo;
+	struct xt_quota_info *q = par->matchinfo;
 
 	if (q->flags & ~XT_QUOTA_MASK)
 		return false;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index e9f64ef4565..4b05ce168a7 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -74,13 +74,9 @@ xt_rateest_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ret;
 }
 
-static bool xt_rateest_mt_checkentry(const char *tablename,
-				     const void *ip,
-				     const struct xt_match *match,
-				     void *matchinfo,
-				     unsigned int hook_mask)
+static bool xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
 {
-	struct xt_rateest_match_info *info = matchinfo;
+	struct xt_rateest_match_info *info = par->matchinfo;
 	struct xt_rateest *est1, *est2;
 
 	if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS |
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index baeb90a5623..a512b49f3fe 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -280,12 +280,9 @@ out:
 	return ret;
 }
 
-static bool
-recent_mt_check(const char *tablename, const void *ip,
-                const struct xt_match *match, void *matchinfo,
-                unsigned int hook_mask)
+static bool recent_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_recent_mtinfo *info = matchinfo;
+	const struct xt_recent_mtinfo *info = par->matchinfo;
 	struct recent_table *t;
 	unsigned i;
 	bool ret = false;
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index b0014ab65da..e223cb43ae8 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -147,12 +147,9 @@ sctp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			   XT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
 }
 
-static bool
-sctp_mt_check(const char *tablename, const void *inf,
-              const struct xt_match *match, void *matchinfo,
-              unsigned int hook_mask)
+static bool sctp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_sctp_info *info = matchinfo;
+	const struct xt_sctp_info *info = par->matchinfo;
 
 	return !(info->flags & ~XT_SCTP_VALID_FLAGS)
 		&& !(info->invflags & ~XT_SCTP_VALID_FLAGS)
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 29f5a8a1b02..88b1235519d 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -37,14 +37,11 @@ state_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return (sinfo->statemask & statebit);
 }
 
-static bool
-state_mt_check(const char *tablename, const void *inf,
-               const struct xt_match *match, void *matchinfo,
-               unsigned int hook_mask)
+static bool state_mt_check(const struct xt_mtchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", match->family);
+				    "proto=%u\n", par->match->family);
 		return false;
 	}
 	return true;
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index dcadc491db2..0d75141139d 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -49,12 +49,9 @@ statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return ret;
 }
 
-static bool
-statistic_mt_check(const char *tablename, const void *entry,
-                   const struct xt_match *match, void *matchinfo,
-                   unsigned int hook_mask)
+static bool statistic_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_statistic_info *info = matchinfo;
+	struct xt_statistic_info *info = par->matchinfo;
 
 	if (info->mode > XT_STATISTIC_MODE_MAX ||
 	    info->flags & ~XT_STATISTIC_MASK)
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 33f2d29ca4f..c9407aa78f7 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -40,12 +40,9 @@ string_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 
 #define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m))
 
-static bool
-string_mt_check(const char *tablename, const void *ip,
-                const struct xt_match *match, void *matchinfo,
-                unsigned int hook_mask)
+static bool string_mt_check(const struct xt_mtchk_param *par)
 {
-	struct xt_string_info *conf = matchinfo;
+	struct xt_string_info *conf = par->matchinfo;
 	struct ts_config *ts_conf;
 	int flags = TS_AUTOLOAD;
 
@@ -56,7 +53,7 @@ string_mt_check(const char *tablename, const void *ip,
 		return false;
 	if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE)
 		return false;
-	if (match->revision == 1) {
+	if (par->match->revision == 1) {
 		if (conf->u.v1.flags &
 		    ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT))
 			return false;
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 66cf71b1d59..1ebdc4934ee 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -126,13 +126,9 @@ static bool tcp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-tcp_mt_check(const char *tablename, const void *info,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool tcp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_tcp *tcpinfo = matchinfo;
+	const struct xt_tcp *tcpinfo = par->matchinfo;
 
 	/* Must specify no unknown invflags */
 	return !(tcpinfo->invflags & ~XT_TCP_INV_MASK);
@@ -165,13 +161,9 @@ static bool udp_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			      !!(udpinfo->invflags & XT_UDP_INV_DSTPT));
 }
 
-/* Called when user tries to insert an entry of this type. */
-static bool
-udp_mt_check(const char *tablename, const void *info,
-             const struct xt_match *match, void *matchinfo,
-             unsigned int hook_mask)
+static bool udp_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_udp *udpinfo = matchinfo;
+	const struct xt_udp *udpinfo = par->matchinfo;
 
 	/* Must specify no unknown invflags */
 	return !(udpinfo->invflags & ~XT_UDP_INV_MASK);
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 28599d3979c..29375ba8db7 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -218,12 +218,9 @@ time_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static bool
-time_mt_check(const char *tablename, const void *ip,
-              const struct xt_match *match, void *matchinfo,
-              unsigned int hook_mask)
+static bool time_mt_check(const struct xt_mtchk_param *par)
 {
-	const struct xt_time_info *info = matchinfo;
+	const struct xt_time_info *info = par->matchinfo;
 
 	if (info->daytime_start > XT_TIME_MAX_DAYTIME ||
 	    info->daytime_stop > XT_TIME_MAX_DAYTIME) {
-- 
cgit v1.2.3


From 6be3d8598e883fb632edf059ba2f8d1b9f4da138 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:19 +0200
Subject: netfilter: xtables: move extension arguments into compound structure
 (3/6)

This patch does this for match extensions' destroy functions.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c | 20 ++++++++++++--------
 net/ipv4/netfilter/ip_tables.c  | 10 +++++++---
 net/ipv6/netfilter/ip6_tables.c | 10 +++++++---
 net/netfilter/xt_connbytes.c    |  4 ++--
 net/netfilter/xt_connlimit.c    |  7 +++----
 net/netfilter/xt_connmark.c     |  5 ++---
 net/netfilter/xt_conntrack.c    |  5 ++---
 net/netfilter/xt_hashlimit.c    |  9 ++++-----
 net/netfilter/xt_helper.c       |  4 ++--
 net/netfilter/xt_rateest.c      |  5 ++---
 net/netfilter/xt_recent.c       |  4 ++--
 net/netfilter/xt_state.c        |  4 ++--
 net/netfilter/xt_string.c       |  4 ++--
 13 files changed, 49 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 5ce37b2f5b8..0320b520362 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -558,12 +558,16 @@ ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo,
 static inline int
 ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i)
 {
+	struct xt_mtdtor_param par;
+
 	if (i && (*i)-- == 0)
 		return 1;
-	if (m->u.match->destroy)
-		m->u.match->destroy(m->u.match, m->data);
-	module_put(m->u.match->me);
 
+	par.match     = m->u.match;
+	par.matchinfo = m->data;
+	if (par.match->destroy != NULL)
+		par.match->destroy(&par);
+	module_put(par.match->me);
 	return 0;
 }
 
@@ -609,7 +613,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	unsigned int i, j, hook = 0, hookmask = 0;
 	size_t gap;
 	int ret;
-	struct xt_mtchk_param par;
+	struct xt_mtchk_param mtpar;
 
 	/* don't mess with the struct ebt_entries */
 	if (e->bitmask == 0)
@@ -651,10 +655,10 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	}
 	i = 0;
 
-	par.table     = name;
-	par.entryinfo = e;
-	par.hook_mask = hookmask;
-	ret = EBT_MATCH_ITERATE(e, ebt_check_match, &par, &i);
+	mtpar.table     = name;
+	mtpar.entryinfo = e;
+	mtpar.hook_mask = hookmask;
+	ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
 	if (ret != 0)
 		goto cleanup_matches;
 	j = 0;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4147298a6a8..12ad4d5c55d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -576,12 +576,16 @@ mark_source_chains(struct xt_table_info *newinfo,
 static int
 cleanup_match(struct ipt_entry_match *m, unsigned int *i)
 {
+	struct xt_mtdtor_param par;
+
 	if (i && (*i)-- == 0)
 		return 1;
 
-	if (m->u.kernel.match->destroy)
-		m->u.kernel.match->destroy(m->u.kernel.match, m->data);
-	module_put(m->u.kernel.match->me);
+	par.match     = m->u.kernel.match;
+	par.matchinfo = m->data;
+	if (par.match->destroy != NULL)
+		par.match->destroy(&par);
+	module_put(par.match->me);
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 9c843e3777b..891358e89a2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -599,12 +599,16 @@ mark_source_chains(struct xt_table_info *newinfo,
 static int
 cleanup_match(struct ip6t_entry_match *m, unsigned int *i)
 {
+	struct xt_mtdtor_param par;
+
 	if (i && (*i)-- == 0)
 		return 1;
 
-	if (m->u.kernel.match->destroy)
-		m->u.kernel.match->destroy(m->u.kernel.match, m->data);
-	module_put(m->u.kernel.match->me);
+	par.match     = m->u.kernel.match;
+	par.matchinfo = m->data;
+	if (par.match->destroy != NULL)
+		par.match->destroy(&par);
+	module_put(par.match->me);
 	return 0;
 }
 
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 43a36c728e5..5bf4aa08b0f 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -115,9 +115,9 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void connbytes_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(match->family);
+	nf_ct_l3proto_module_put(par->match->family);
 }
 
 static struct xt_match connbytes_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 1361e9919cf..bfb3ee6c712 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -246,16 +246,15 @@ static bool connlimit_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void
-connlimit_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	const struct xt_connlimit_info *info = matchinfo;
+	const struct xt_connlimit_info *info = par->matchinfo;
 	struct xt_connlimit_conn *conn;
 	struct xt_connlimit_conn *tmp;
 	struct list_head *hash = info->data->iphash;
 	unsigned int i;
 
-	nf_ct_l3proto_module_put(match->family);
+	nf_ct_l3proto_module_put(par->match->family);
 
 	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
 		list_for_each_entry_safe(conn, tmp, &hash[i], list) {
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index b935b7888a9..c708577ea1b 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -87,10 +87,9 @@ static bool connmark_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void
-connmark_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(match->family);
+	nf_ct_l3proto_module_put(par->match->family);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index f04c46a02ce..5cd58d7fcb1 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -288,10 +288,9 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void
-conntrack_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(match->family);
+	nf_ct_l3proto_module_put(par->match->family);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 2f73820e46d..6fc4292d46e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -748,17 +748,16 @@ static bool hashlimit_mt_check(const struct xt_mtchk_param *par)
 }
 
 static void
-hashlimit_mt_destroy_v0(const struct xt_match *match, void *matchinfo)
+hashlimit_mt_destroy_v0(const struct xt_mtdtor_param *par)
 {
-	const struct xt_hashlimit_info *r = matchinfo;
+	const struct xt_hashlimit_info *r = par->matchinfo;
 
 	htable_put(r->hinfo);
 }
 
-static void
-hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	const struct xt_hashlimit_mtinfo1 *info = matchinfo;
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 
 	htable_put(info->hinfo);
 }
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 86d3c332fcb..280c984349f 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -67,9 +67,9 @@ static bool helper_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void helper_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void helper_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(match->family);
+	nf_ct_l3proto_module_put(par->match->family);
 }
 
 static struct xt_match helper_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 4b05ce168a7..220a1d588ee 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -117,10 +117,9 @@ err1:
 	return false;
 }
 
-static void xt_rateest_mt_destroy(const struct xt_match *match,
-				  void *matchinfo)
+static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	struct xt_rateest_match_info *info = matchinfo;
+	struct xt_rateest_match_info *info = par->matchinfo;
 
 	xt_rateest_put(info->est1);
 	if (info->est2)
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index a512b49f3fe..4ebd4ca9a99 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -349,9 +349,9 @@ out:
 	return ret;
 }
 
-static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void recent_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	const struct xt_recent_mtinfo *info = matchinfo;
+	const struct xt_recent_mtinfo *info = par->matchinfo;
 	struct recent_table *t;
 
 	mutex_lock(&recent_mutex);
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 88b1235519d..4c946cbd731 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -47,9 +47,9 @@ static bool state_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void state_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void state_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(match->family);
+	nf_ct_l3proto_module_put(par->match->family);
 }
 
 static struct xt_match state_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index c9407aa78f7..b4d77411131 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -70,9 +70,9 @@ static bool string_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static void string_mt_destroy(const struct xt_match *match, void *matchinfo)
+static void string_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+	textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config);
 }
 
 static struct xt_match xt_string_mt_reg[] __read_mostly = {
-- 
cgit v1.2.3


From 7eb3558655aaa87a3e71a0c065dfaddda521fa6d Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:19 +0200
Subject: netfilter: xtables: move extension arguments into compound structure
 (4/6)

This patch does this for target extensions' target functions.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_arpreply.c |  8 +++-----
 net/bridge/netfilter/ebt_dnat.c     |  6 ++----
 net/bridge/netfilter/ebt_log.c      | 14 ++++++--------
 net/bridge/netfilter/ebt_mark.c     |  6 ++----
 net/bridge/netfilter/ebt_nflog.c    |  9 ++++-----
 net/bridge/netfilter/ebt_redirect.c | 12 +++++-------
 net/bridge/netfilter/ebt_snat.c     |  6 ++----
 net/bridge/netfilter/ebt_ulog.c     |  9 +++------
 net/bridge/netfilter/ebtables.c     | 27 ++++++++++++++++-----------
 net/ipv4/netfilter/arp_tables.c     | 23 ++++++++++++-----------
 net/ipv4/netfilter/arpt_mangle.c    |  7 ++-----
 net/ipv4/netfilter/ip_tables.c      | 24 ++++++++++--------------
 net/ipv4/netfilter/ipt_CLUSTERIP.c  |  6 ++----
 net/ipv4/netfilter/ipt_ECN.c        |  6 ++----
 net/ipv4/netfilter/ipt_LOG.c        |  8 +++-----
 net/ipv4/netfilter/ipt_MASQUERADE.c | 14 ++++++--------
 net/ipv4/netfilter/ipt_NETMAP.c     | 17 ++++++++---------
 net/ipv4/netfilter/ipt_REDIRECT.c   | 12 +++++-------
 net/ipv4/netfilter/ipt_REJECT.c     |  8 +++-----
 net/ipv4/netfilter/ipt_TTL.c        |  6 ++----
 net/ipv4/netfilter/ipt_ULOG.c       | 10 +++-------
 net/ipv4/netfilter/nf_nat_rule.c    | 32 ++++++++++++--------------------
 net/ipv6/netfilter/ip6_tables.c     | 24 +++++++++++-------------
 net/ipv6/netfilter/ip6t_HL.c        |  6 ++----
 net/ipv6/netfilter/ip6t_LOG.c       |  8 +++-----
 net/ipv6/netfilter/ip6t_REJECT.c    | 18 ++++++++----------
 net/netfilter/xt_CLASSIFY.c         |  6 ++----
 net/netfilter/xt_CONNMARK.c         | 12 ++++--------
 net/netfilter/xt_CONNSECMARK.c      |  6 ++----
 net/netfilter/xt_DSCP.c             | 30 ++++++++++--------------------
 net/netfilter/xt_MARK.c             | 18 ++++++------------
 net/netfilter/xt_NFLOG.c            | 10 ++++------
 net/netfilter/xt_NFQUEUE.c          |  6 ++----
 net/netfilter/xt_NOTRACK.c          |  4 +---
 net/netfilter/xt_RATEEST.c          |  9 ++-------
 net/netfilter/xt_SECMARK.c          |  6 ++----
 net/netfilter/xt_TCPMSS.c           | 12 ++++--------
 net/netfilter/xt_TCPOPTSTRIP.c      | 12 ++++--------
 net/netfilter/xt_TPROXY.c           | 11 +++--------
 net/netfilter/xt_TRACE.c            |  4 +---
 net/sched/act_ipt.c                 | 12 ++++++++----
 41 files changed, 192 insertions(+), 292 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index baf5510d044..fc94699f719 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -16,11 +16,9 @@
 #include <linux/netfilter_bridge/ebt_arpreply.h>
 
 static unsigned int
-ebt_arpreply_tg(struct sk_buff *skb, const struct net_device *in,
-		const struct net_device *out, unsigned int hook_nr,
-		const struct xt_target *target, const void *data)
+ebt_arpreply_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_arpreply_info *info = data;
+	const struct ebt_arpreply_info *info = par->targinfo;
 	const __be32 *siptr, *diptr;
 	__be32 _sip, _dip;
 	const struct arphdr *ap;
@@ -53,7 +51,7 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct net_device *in,
 	if (diptr == NULL)
 		return EBT_DROP;
 
-	arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)in,
+	arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in,
 		 *diptr, shp, info->mac, shp);
 
 	return info->target;
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index cb80101e412..bb5d79e0bee 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -15,11 +15,9 @@
 #include <linux/netfilter_bridge/ebt_nat.h>
 
 static unsigned int
-ebt_dnat_tg(struct sk_buff *skb, const struct net_device *in,
-	    const struct net_device *out, unsigned int hook_nr,
-	    const struct xt_target *target, const void *data)
+ebt_dnat_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_nat_info *info = data;
+	const struct ebt_nat_info *info = par->targinfo;
 
 	if (!skb_make_writable(skb, 0))
 		return EBT_DROP;
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index b40f9ed4c34..87de5fccb2f 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -195,11 +195,9 @@ out:
 }
 
 static unsigned int
-ebt_log_tg(struct sk_buff *skb, const struct net_device *in,
-	   const struct net_device *out, unsigned int hooknr,
-	   const struct xt_target *target, const void *data)
+ebt_log_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_log_info *info = data;
+	const struct ebt_log_info *info = par->targinfo;
 	struct nf_loginfo li;
 
 	li.type = NF_LOG_TYPE_LOG;
@@ -207,11 +205,11 @@ ebt_log_tg(struct sk_buff *skb, const struct net_device *in,
 	li.u.log.logflags = info->bitmask;
 
 	if (info->bitmask & EBT_LOG_NFLOG)
-		nf_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li,
-			      "%s", info->prefix);
+		nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
+		              par->out, &li, "%s", info->prefix);
 	else
-		ebt_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li,
-			       info->prefix);
+		ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
+		               par->out, &li, info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index dff19fc91cf..aafc456c3c3 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -19,11 +19,9 @@
 #include <linux/netfilter_bridge/ebt_mark_t.h>
 
 static unsigned int
-ebt_mark_tg(struct sk_buff *skb, const struct net_device *in,
-	    const struct net_device *out, unsigned int hook_nr,
-	    const struct xt_target *target, const void *data)
+ebt_mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_mark_t_info *info = data;
+	const struct ebt_mark_t_info *info = par->targinfo;
 	int action = info->target & -16;
 
 	if (action == MARK_SET_VALUE)
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 74b4fa0aabc..6a28d994cf7 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -20,11 +20,9 @@
 #include <net/netfilter/nf_log.h>
 
 static unsigned int
-ebt_nflog_tg(struct sk_buff *skb, const struct net_device *in,
-	     const struct net_device *out, unsigned int hooknr,
-	     const struct xt_target *target, const void *data)
+ebt_nflog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_nflog_info *info = data;
+	const struct ebt_nflog_info *info = par->targinfo;
 	struct nf_loginfo li;
 
 	li.type = NF_LOG_TYPE_ULOG;
@@ -32,7 +30,8 @@ ebt_nflog_tg(struct sk_buff *skb, const struct net_device *in,
 	li.u.ulog.group = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
-	nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, "%s", info->prefix);
+	nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out,
+	              &li, "%s", info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index a50ffbe0e4f..0cfe2fad940 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -16,20 +16,18 @@
 #include <linux/netfilter_bridge/ebt_redirect.h>
 
 static unsigned int
-ebt_redirect_tg(struct sk_buff *skb, const struct net_device *in,
-		const struct net_device *out, unsigned int hooknr,
-		const struct xt_target *target, const void *data)
+ebt_redirect_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_redirect_info *info = data;
+	const struct ebt_redirect_info *info = par->targinfo;
 
 	if (!skb_make_writable(skb, 0))
 		return EBT_DROP;
 
-	if (hooknr != NF_BR_BROUTING)
+	if (par->hooknum != NF_BR_BROUTING)
 		memcpy(eth_hdr(skb)->h_dest,
-		       in->br_port->br->dev->dev_addr, ETH_ALEN);
+		       par->in->br_port->br->dev->dev_addr, ETH_ALEN);
 	else
-		memcpy(eth_hdr(skb)->h_dest, in->dev_addr, ETH_ALEN);
+		memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN);
 	skb->pkt_type = PACKET_HOST;
 	return info->target;
 }
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 8a55c7d49b5..f55960eee99 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -17,11 +17,9 @@
 #include <linux/netfilter_bridge/ebt_nat.h>
 
 static unsigned int
-ebt_snat_tg(struct sk_buff *skb, const struct net_device *in,
-	    const struct net_device *out, unsigned int hook_nr,
-	    const struct xt_target *target, const void *data)
+ebt_snat_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_nat_info *info = data;
+	const struct ebt_nat_info *info = par->targinfo;
 
 	if (!skb_make_writable(skb, 0))
 		return EBT_DROP;
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 25ca6467349..bfedf12cbf4 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -247,13 +247,10 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum,
 }
 
 static unsigned int
-ebt_ulog_tg(struct sk_buff *skb, const struct net_device *in,
-	    const struct net_device *out, unsigned int hooknr,
-	    const struct xt_target *target, const void *data)
+ebt_ulog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ebt_ulog_info *uloginfo = data;
-
-	ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
+	ebt_ulog_packet(par->hooknum, skb, par->in, par->out,
+	                par->targinfo, NULL);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0320b520362..a1156bab4a0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -64,11 +64,13 @@ static struct xt_target ebt_standard_target = {
 	.targetsize = sizeof(int),
 };
 
-static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
-   struct sk_buff *skb, unsigned int hooknr, const struct net_device *in,
-   const struct net_device *out)
+static inline int
+ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb,
+	       struct xt_target_param *par)
 {
-	w->u.watcher->target(skb, in, out, hooknr, w->u.watcher, w->data);
+	par->target   = w->u.watcher;
+	par->targinfo = w->data;
+	w->u.watcher->target(skb, par);
 	/* watchers don't give a verdict */
 	return 0;
 }
@@ -156,10 +158,12 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 	struct ebt_table_info *private;
 	bool hotdrop = false;
 	struct xt_match_param mtpar;
+	struct xt_target_param tgpar;
 
-	mtpar.in      = in;
-	mtpar.out     = out;
+	mtpar.in      = tgpar.in  = in;
+	mtpar.out     = tgpar.out = out;
 	mtpar.hotdrop = &hotdrop;
+	tgpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
 	private = table->private;
@@ -193,17 +197,18 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 
 		/* these should only watch: not modify, nor tell us
 		   what to do with the packet */
-		EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, hook, in,
-		   out);
+		EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &tgpar);
 
 		t = (struct ebt_entry_target *)
 		   (((char *)point) + point->target_offset);
 		/* standard target */
 		if (!t->u.target->target)
 			verdict = ((struct ebt_standard_target *)t)->verdict;
-		else
-			verdict = t->u.target->target(skb, in, out, hook,
-				  t->u.target, t->data);
+		else {
+			tgpar.target   = t->u.target;
+			tgpar.targinfo = t->data;
+			verdict = t->u.target->target(skb, &tgpar);
+		}
 		if (verdict == EBT_ACCEPT) {
 			read_unlock_bh(&table->lock);
 			return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ae525a9afbe..5b631ad74b5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -200,15 +200,12 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
 	return 1;
 }
 
-static unsigned int arpt_error(struct sk_buff *skb,
-			       const struct net_device *in,
-			       const struct net_device *out,
-			       unsigned int hooknum,
-			       const struct xt_target *target,
-			       const void *targinfo)
+static unsigned int
+arpt_error(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	if (net_ratelimit())
-		printk("arp_tables: error: '%s'\n", (char *)targinfo);
+		printk("arp_tables: error: '%s'\n",
+		       (const char *)par->targinfo);
 
 	return NF_DROP;
 }
@@ -232,6 +229,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	const char *indev, *outdev;
 	void *table_base;
 	const struct xt_table_info *private;
+	struct xt_target_param tgpar;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
 		return NF_DROP;
@@ -245,6 +243,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
+	tgpar.in      = in;
+	tgpar.out     = out;
+	tgpar.hooknum = hook;
+
 	arp = arp_hdr(skb);
 	do {
 		if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
@@ -290,11 +292,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 				/* Targets which reenter must return
 				 * abs. verdicts
 				 */
+				tgpar.target   = t->u.kernel.target;
+				tgpar.targinfo = t->data;
 				verdict = t->u.kernel.target->target(skb,
-								     in, out,
-								     hook,
-								     t->u.kernel.target,
-								     t->data);
+								     &tgpar);
 
 				/* Target might have changed stuff. */
 				arp = arp_hdr(skb);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 3f9e4ccd616..0bf81b35369 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -9,12 +9,9 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
 MODULE_DESCRIPTION("arptables arp payload mangle target");
 
 static unsigned int
-target(struct sk_buff *skb,
-       const struct net_device *in, const struct net_device *out,
-       unsigned int hooknum, const struct xt_target *target,
-       const void *targinfo)
+target(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct arpt_mangle *mangle = targinfo;
+	const struct arpt_mangle *mangle = par->targinfo;
 	const struct arphdr *arp;
 	unsigned char *arpptr;
 	int pln, hln;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 12ad4d5c55d..0f8ecf39022 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -171,15 +171,11 @@ ip_checkentry(const struct ipt_ip *ip)
 }
 
 static unsigned int
-ipt_error(struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  unsigned int hooknum,
-	  const struct xt_target *target,
-	  const void *targinfo)
+ipt_error(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	if (net_ratelimit())
-		printk("ip_tables: error: `%s'\n", (char *)targinfo);
+		printk("ip_tables: error: `%s'\n",
+		       (const char *)par->targinfo);
 
 	return NF_DROP;
 }
@@ -334,6 +330,7 @@ ipt_do_table(struct sk_buff *skb,
 	struct ipt_entry *e, *back;
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
+	struct xt_target_param tgpar;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -349,8 +346,9 @@ ipt_do_table(struct sk_buff *skb,
 	mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
 	mtpar.thoff   = ip_hdrlen(skb);
 	mtpar.hotdrop = &hotdrop;
-	mtpar.in      = in;
-	mtpar.out     = out;
+	mtpar.in      = tgpar.in  = in;
+	mtpar.out     = tgpar.out = out;
+	tgpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -414,16 +412,14 @@ ipt_do_table(struct sk_buff *skb,
 			} else {
 				/* Targets which reenter must return
 				   abs. verdicts */
+				tgpar.target   = t->u.kernel.target;
+				tgpar.targinfo = t->data;
 #ifdef CONFIG_NETFILTER_DEBUG
 				((struct ipt_entry *)table_base)->comefrom
 					= 0xeeeeeeec;
 #endif
 				verdict = t->u.kernel.target->target(skb,
-								     in, out,
-								     hook,
-								     t->u.kernel.target,
-								     t->data);
-
+								     &tgpar);
 #ifdef CONFIG_NETFILTER_DEBUG
 				if (((struct ipt_entry *)table_base)->comefrom
 				    != 0xeeeeeeec
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 63faddc18a1..67e8aa8f34f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -281,11 +281,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
  ***********************************************************************/
 
 static unsigned int
-clusterip_tg(struct sk_buff *skb, const struct net_device *in,
-             const struct net_device *out, unsigned int hooknum,
-             const struct xt_target *target, const void *targinfo)
+clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+	const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	u_int32_t hash;
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index aee2364afff..e37f181e829 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -77,11 +77,9 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
 }
 
 static unsigned int
-ecn_tg(struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, unsigned int hooknum,
-       const struct xt_target *target, const void *targinfo)
+ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ipt_ECN_info *einfo = targinfo;
+	const struct ipt_ECN_info *einfo = par->targinfo;
 
 	if (einfo->operation & IPT_ECN_OP_SET_IP)
 		if (!set_ect_ip(skb, einfo))
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 1c9785df4df..e9942aed35a 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -426,18 +426,16 @@ ipt_log_packet(u_int8_t pf,
 }
 
 static unsigned int
-log_tg(struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, unsigned int hooknum,
-       const struct xt_target *target, const void *targinfo)
+log_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ipt_log_info *loginfo = targinfo;
+	const struct ipt_log_info *loginfo = par->targinfo;
 	struct nf_loginfo li;
 
 	li.type = NF_LOG_TYPE_LOG;
 	li.u.log.level = loginfo->level;
 	li.u.log.logflags = loginfo->logflags;
 
-	ipt_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, &li,
+	ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li,
 		       loginfo->prefix);
 	return XT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 65c811b27b7..e0d9d49b79e 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -50,9 +50,7 @@ masquerade_tg_check(const char *tablename, const void *e,
 }
 
 static unsigned int
-masquerade_tg(struct sk_buff *skb, const struct net_device *in,
-              const struct net_device *out, unsigned int hooknum,
-              const struct xt_target *target, const void *targinfo)
+masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct nf_conn *ct;
 	struct nf_conn_nat *nat;
@@ -62,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in,
 	const struct rtable *rt;
 	__be32 newsrc;
 
-	NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
+	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
 
 	ct = nf_ct_get(skb, &ctinfo);
 	nat = nfct_nat(ct);
@@ -76,16 +74,16 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in,
 	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
 		return NF_ACCEPT;
 
-	mr = targinfo;
+	mr = par->targinfo;
 	rt = skb->rtable;
-	newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+	newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
 	if (!newsrc) {
-		printk("MASQUERADE: %s ate my IP address\n", out->name);
+		printk("MASQUERADE: %s ate my IP address\n", par->out->name);
 		return NF_DROP;
 	}
 
 	write_lock_bh(&masq_lock);
-	nat->masq_index = out->ifindex;
+	nat->masq_index = par->out->ifindex;
 	write_unlock_bh(&masq_lock);
 
 	/* Transfer from original range. */
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index f281500bd7f..cf18f23b346 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -41,24 +41,23 @@ netmap_tg_check(const char *tablename, const void *e,
 }
 
 static unsigned int
-netmap_tg(struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, unsigned int hooknum,
-          const struct xt_target *target, const void *targinfo)
+netmap_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	__be32 new_ip, netmask;
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 	struct nf_nat_range newrange;
 
-	NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING
-		     || hooknum == NF_INET_POST_ROUTING
-		     || hooknum == NF_INET_LOCAL_OUT);
+	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+		     par->hooknum == NF_INET_POST_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_OUT);
 	ct = nf_ct_get(skb, &ctinfo);
 
 	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
 
-	if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT)
+	if (par->hooknum == NF_INET_PRE_ROUTING ||
+	    par->hooknum == NF_INET_LOCAL_OUT)
 		new_ip = ip_hdr(skb)->daddr & ~netmask;
 	else
 		new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -70,7 +69,7 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in,
 		  mr->range[0].min, mr->range[0].max });
 
 	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum));
+	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
 }
 
 static struct xt_target netmap_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index ef496105eae..23adb09ddfb 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -45,24 +45,22 @@ redirect_tg_check(const char *tablename, const void *e,
 }
 
 static unsigned int
-redirect_tg(struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, unsigned int hooknum,
-            const struct xt_target *target, const void *targinfo)
+redirect_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	__be32 newdst;
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 	struct nf_nat_range newrange;
 
-	NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING
-		     || hooknum == NF_INET_LOCAL_OUT);
+	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_OUT);
 
 	ct = nf_ct_get(skb, &ctinfo);
 	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
 
 	/* Local packets: make them go to loopback */
-	if (hooknum == NF_INET_LOCAL_OUT)
+	if (par->hooknum == NF_INET_LOCAL_OUT)
 		newdst = htonl(0x7F000001);
 	else {
 		struct in_device *indev;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 9f5da0c2cae..b36071bb107 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -136,11 +136,9 @@ static inline void send_unreach(struct sk_buff *skb_in, int code)
 }
 
 static unsigned int
-reject_tg(struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, unsigned int hooknum,
-          const struct xt_target *target, const void *targinfo)
+reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ipt_reject_info *reject = targinfo;
+	const struct ipt_reject_info *reject = par->targinfo;
 
 	/* WARNING: This code causes reentry within iptables.
 	   This means that the iptables jump stack is now crap.  We
@@ -168,7 +166,7 @@ reject_tg(struct sk_buff *skb, const struct net_device *in,
 		send_unreach(skb, ICMP_PKT_FILTERED);
 		break;
 	case IPT_TCP_RESET:
-		send_reset(skb, hooknum);
+		send_reset(skb, par->hooknum);
 	case IPT_ICMP_ECHOREPLY:
 		/* Doesn't happen. */
 		break;
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 7d01d424a71..05cbfd2f747 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -20,12 +20,10 @@ MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target");
 MODULE_LICENSE("GPL");
 
 static unsigned int
-ttl_tg(struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, unsigned int hooknum,
-       const struct xt_target *target, const void *targinfo)
+ttl_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct iphdr *iph;
-	const struct ipt_TTL_info *info = targinfo;
+	const struct ipt_TTL_info *info = par->targinfo;
 	int new_ttl;
 
 	if (!skb_make_writable(skb, skb->len))
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 9065e4a34fb..46c0df0dc2d 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -281,14 +281,10 @@ alloc_failure:
 }
 
 static unsigned int
-ulog_tg(struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, unsigned int hooknum,
-        const struct xt_target *target, const void *targinfo)
+ulog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
-
-	ipt_ulog_packet(hooknum, skb, in, out, loginfo, NULL);
-
+	ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
+	                par->targinfo, NULL);
 	return XT_CONTINUE;
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index f929352ec0e..83170ff131f 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -67,25 +67,21 @@ static struct xt_table nat_table = {
 };
 
 /* Source NAT */
-static unsigned int ipt_snat_target(struct sk_buff *skb,
-				    const struct net_device *in,
-				    const struct net_device *out,
-				    unsigned int hooknum,
-				    const struct xt_target *target,
-				    const void *targinfo)
+static unsigned int
+ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
-	NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
+	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
 
 	ct = nf_ct_get(skb, &ctinfo);
 
 	/* Connection must be valid and new. */
 	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
 			    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
-	NF_CT_ASSERT(out);
+	NF_CT_ASSERT(par->out != NULL);
 
 	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
 }
@@ -109,28 +105,24 @@ static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip)
 	ip_rt_put(rt);
 }
 
-static unsigned int ipt_dnat_target(struct sk_buff *skb,
-				    const struct net_device *in,
-				    const struct net_device *out,
-				    unsigned int hooknum,
-				    const struct xt_target *target,
-				    const void *targinfo)
+static unsigned int
+ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
-	NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING ||
-		     hooknum == NF_INET_LOCAL_OUT);
+	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_OUT);
 
 	ct = nf_ct_get(skb, &ctinfo);
 
 	/* Connection must be valid and new. */
 	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
 
-	if (hooknum == NF_INET_LOCAL_OUT &&
+	if (par->hooknum == NF_INET_LOCAL_OUT &&
 	    mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
-		warn_if_extra_mangle(dev_net(out), ip_hdr(skb)->daddr,
+		warn_if_extra_mangle(dev_net(par->out), ip_hdr(skb)->daddr,
 				     mr->range[0].min_ip);
 
 	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 891358e89a2..ee0986cdbd6 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -200,15 +200,11 @@ ip6_checkentry(const struct ip6t_ip6 *ipv6)
 }
 
 static unsigned int
-ip6t_error(struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  unsigned int hooknum,
-	  const struct xt_target *target,
-	  const void *targinfo)
+ip6t_error(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	if (net_ratelimit())
-		printk("ip6_tables: error: `%s'\n", (char *)targinfo);
+		printk("ip6_tables: error: `%s'\n",
+		       (const char *)par->targinfo);
 
 	return NF_DROP;
 }
@@ -360,6 +356,7 @@ ip6t_do_table(struct sk_buff *skb,
 	struct ip6t_entry *e, *back;
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
+	struct xt_target_param tgpar;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -371,8 +368,9 @@ ip6t_do_table(struct sk_buff *skb,
 	 * rule is also a fragment-specific rule, non-fragments won't
 	 * match it. */
 	mtpar.hotdrop = &hotdrop;
-	mtpar.in      = in;
-	mtpar.out     = out;
+	mtpar.in      = tgpar.in  = in;
+	mtpar.out     = tgpar.out = out;
+	tgpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -438,15 +436,15 @@ ip6t_do_table(struct sk_buff *skb,
 			} else {
 				/* Targets which reenter must return
 				   abs. verdicts */
+				tgpar.target   = t->u.kernel.target;
+				tgpar.targinfo = t->data;
+
 #ifdef CONFIG_NETFILTER_DEBUG
 				((struct ip6t_entry *)table_base)->comefrom
 					= 0xeeeeeeec;
 #endif
 				verdict = t->u.kernel.target->target(skb,
-								     in, out,
-								     hook,
-								     t->u.kernel.target,
-								     t->data);
+								     &tgpar);
 
 #ifdef CONFIG_NETFILTER_DEBUG
 				if (((struct ip6t_entry *)table_base)->comefrom
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index 7eebd350916..ac759a54f2c 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -19,12 +19,10 @@ MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field modification target");
 MODULE_LICENSE("GPL");
 
 static unsigned int
-hl_tg6(struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, unsigned int hooknum,
-       const struct xt_target *target, const void *targinfo)
+hl_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct ipv6hdr *ip6h;
-	const struct ip6t_HL_info *info = targinfo;
+	const struct ip6t_HL_info *info = par->targinfo;
 	int new_hl;
 
 	if (!skb_make_writable(skb, skb->len))
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index fd148f3d842..a31d3ecd1fc 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -438,18 +438,16 @@ ip6t_log_packet(u_int8_t pf,
 }
 
 static unsigned int
-log_tg6(struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, unsigned int hooknum,
-        const struct xt_target *target, const void *targinfo)
+log_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ip6t_log_info *loginfo = targinfo;
+	const struct ip6t_log_info *loginfo = par->targinfo;
 	struct nf_loginfo li;
 
 	li.type = NF_LOG_TYPE_LOG;
 	li.u.log.level = loginfo->level;
 	li.u.log.logflags = loginfo->logflags;
 
-	ip6t_log_packet(NFPROTO_IPV6, hooknum, skb, in, out,
+	ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, par->out,
 			&li, loginfo->prefix);
 	return XT_CONTINUE;
 }
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index f1a9fce1ec9..1d5f3a70ed0 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -173,12 +173,10 @@ send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code,
 }
 
 static unsigned int
-reject_tg6(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+reject_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ip6t_reject_info *reject = targinfo;
-	struct net *net = dev_net(in ? in : out);
+	const struct ip6t_reject_info *reject = par->targinfo;
+	struct net *net = dev_net((par->in != NULL) ? par->in : par->out);
 
 	pr_debug("%s: medium point\n", __func__);
 	/* WARNING: This code causes reentry within ip6tables.
@@ -186,19 +184,19 @@ reject_tg6(struct sk_buff *skb, const struct net_device *in,
 	   must return an absolute verdict. --RR */
 	switch (reject->with) {
 	case IP6T_ICMP6_NO_ROUTE:
-		send_unreach(net, skb, ICMPV6_NOROUTE, hooknum);
+		send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum);
 		break;
 	case IP6T_ICMP6_ADM_PROHIBITED:
-		send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, hooknum);
+		send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum);
 		break;
 	case IP6T_ICMP6_NOT_NEIGHBOUR:
-		send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, hooknum);
+		send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum);
 		break;
 	case IP6T_ICMP6_ADDR_UNREACH:
-		send_unreach(net, skb, ICMPV6_ADDR_UNREACH, hooknum);
+		send_unreach(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum);
 		break;
 	case IP6T_ICMP6_PORT_UNREACH:
-		send_unreach(net, skb, ICMPV6_PORT_UNREACH, hooknum);
+		send_unreach(net, skb, ICMPV6_PORT_UNREACH, par->hooknum);
 		break;
 	case IP6T_ICMP6_ECHOREPLY:
 		/* Do nothing */
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 8cffa295dd3..011bc80dd2a 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -27,11 +27,9 @@ MODULE_ALIAS("ipt_CLASSIFY");
 MODULE_ALIAS("ip6t_CLASSIFY");
 
 static unsigned int
-classify_tg(struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, unsigned int hooknum,
-            const struct xt_target *target, const void *targinfo)
+classify_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_classify_target_info *clinfo = targinfo;
+	const struct xt_classify_target_info *clinfo = par->targinfo;
 
 	skb->priority = clinfo->priority;
 	return XT_CONTINUE;
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index e1415c3f5c9..95ed267328a 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -36,11 +36,9 @@ MODULE_ALIAS("ip6t_CONNMARK");
 #include <net/netfilter/nf_conntrack_ecache.h>
 
 static unsigned int
-connmark_tg_v0(struct sk_buff *skb, const struct net_device *in,
-               const struct net_device *out, unsigned int hooknum,
-               const struct xt_target *target, const void *targinfo)
+connmark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_connmark_target_info *markinfo = targinfo;
+	const struct xt_connmark_target_info *markinfo = par->targinfo;
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	u_int32_t diff;
@@ -77,11 +75,9 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in,
 }
 
 static unsigned int
-connmark_tg(struct sk_buff *skb, const struct net_device *in,
-            const struct net_device *out, unsigned int hooknum,
-            const struct xt_target *target, const void *targinfo)
+connmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_connmark_tginfo1 *info = targinfo;
+	const struct xt_connmark_tginfo1 *info = par->targinfo;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 	u_int32_t newmark;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 5f221c3bd35..2211a2cef28 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -65,11 +65,9 @@ static void secmark_restore(struct sk_buff *skb)
 }
 
 static unsigned int
-connsecmark_tg(struct sk_buff *skb, const struct net_device *in,
-               const struct net_device *out, unsigned int hooknum,
-               const struct xt_target *target, const void *targinfo)
+connsecmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_connsecmark_target_info *info = targinfo;
+	const struct xt_connsecmark_target_info *info = par->targinfo;
 
 	switch (info->mode) {
 	case CONNSECMARK_SAVE:
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index f0b4958528e..c78e80afdf3 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -29,11 +29,9 @@ MODULE_ALIAS("ipt_TOS");
 MODULE_ALIAS("ip6t_TOS");
 
 static unsigned int
-dscp_tg(struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, unsigned int hooknum,
-        const struct xt_target *target, const void *targinfo)
+dscp_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_DSCP_info *dinfo = targinfo;
+	const struct xt_DSCP_info *dinfo = par->targinfo;
 	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	if (dscp != dinfo->dscp) {
@@ -48,11 +46,9 @@ dscp_tg(struct sk_buff *skb, const struct net_device *in,
 }
 
 static unsigned int
-dscp_tg6(struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, unsigned int hooknum,
-         const struct xt_target *target, const void *targinfo)
+dscp_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_DSCP_info *dinfo = targinfo;
+	const struct xt_DSCP_info *dinfo = par->targinfo;
 	u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	if (dscp != dinfo->dscp) {
@@ -80,11 +76,9 @@ dscp_tg_check(const char *tablename, const void *e_void,
 }
 
 static unsigned int
-tos_tg_v0(struct sk_buff *skb, const struct net_device *in,
-          const struct net_device *out, unsigned int hooknum,
-          const struct xt_target *target, const void *targinfo)
+tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct ipt_tos_target_info *info = targinfo;
+	const struct ipt_tos_target_info *info = par->targinfo;
 	struct iphdr *iph = ip_hdr(skb);
 	u_int8_t oldtos;
 
@@ -119,11 +113,9 @@ tos_tg_check_v0(const char *tablename, const void *e_void,
 }
 
 static unsigned int
-tos_tg(struct sk_buff *skb, const struct net_device *in,
-       const struct net_device *out, unsigned int hooknum,
-       const struct xt_target *target, const void *targinfo)
+tos_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_tos_target_info *info = targinfo;
+	const struct xt_tos_target_info *info = par->targinfo;
 	struct iphdr *iph = ip_hdr(skb);
 	u_int8_t orig, nv;
 
@@ -141,11 +133,9 @@ tos_tg(struct sk_buff *skb, const struct net_device *in,
 }
 
 static unsigned int
-tos_tg6(struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, unsigned int hooknum,
-        const struct xt_target *target, const void *targinfo)
+tos_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_tos_target_info *info = targinfo;
+	const struct xt_tos_target_info *info = par->targinfo;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	u_int8_t orig, nv;
 
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index c8ea7a80970..27d03f39611 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -25,22 +25,18 @@ MODULE_ALIAS("ipt_MARK");
 MODULE_ALIAS("ip6t_MARK");
 
 static unsigned int
-mark_tg_v0(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+mark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_mark_target_info *markinfo = targinfo;
+	const struct xt_mark_target_info *markinfo = par->targinfo;
 
 	skb->mark = markinfo->mark;
 	return XT_CONTINUE;
 }
 
 static unsigned int
-mark_tg_v1(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+mark_tg_v1(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_mark_target_info_v1 *markinfo = targinfo;
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
 	int mark = 0;
 
 	switch (markinfo->mode) {
@@ -62,11 +58,9 @@ mark_tg_v1(struct sk_buff *skb, const struct net_device *in,
 }
 
 static unsigned int
-mark_tg(struct sk_buff *skb, const struct net_device *in,
-        const struct net_device *out, unsigned int hooknum,
-        const struct xt_target *target, const void *targinfo)
+mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_mark_tginfo2 *info = targinfo;
+	const struct xt_mark_tginfo2 *info = par->targinfo;
 
 	skb->mark = (skb->mark & ~info->mask) ^ info->mark;
 	return XT_CONTINUE;
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 9b095520176..3218ad63bd1 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -21,11 +21,9 @@ MODULE_ALIAS("ipt_NFLOG");
 MODULE_ALIAS("ip6t_NFLOG");
 
 static unsigned int
-nflog_tg(struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, unsigned int hooknum,
-         const struct xt_target *target, const void *targinfo)
+nflog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_nflog_info *info = targinfo;
+	const struct xt_nflog_info *info = par->targinfo;
 	struct nf_loginfo li;
 
 	li.type		     = NF_LOG_TYPE_ULOG;
@@ -33,8 +31,8 @@ nflog_tg(struct sk_buff *skb, const struct net_device *in,
 	li.u.ulog.group	     = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
-	nf_log_packet(target->family, hooknum, skb, in, out, &li,
-		      "%s", info->prefix);
+	nf_log_packet(par->target->family, par->hooknum, skb, par->in,
+	              par->out, &li, "%s", info->prefix);
 	return XT_CONTINUE;
 }
 
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index c03c2e8d06f..2cc1fff4930 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -24,11 +24,9 @@ MODULE_ALIAS("ip6t_NFQUEUE");
 MODULE_ALIAS("arpt_NFQUEUE");
 
 static unsigned int
-nfqueue_tg(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+nfqueue_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_NFQ_info *tinfo = targinfo;
+	const struct xt_NFQ_info *tinfo = par->targinfo;
 
 	return NF_QUEUE_NR(tinfo->queuenum);
 }
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index b9ee268b37c..cc50295cd11 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -13,9 +13,7 @@ MODULE_ALIAS("ipt_NOTRACK");
 MODULE_ALIAS("ip6t_NOTRACK");
 
 static unsigned int
-notrack_tg(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+notrack_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	/* Previously seen (loopback)? Ignore. */
 	if (skb->nfct != NULL)
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index da7946e6ecb..92e33524f78 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -71,14 +71,9 @@ void xt_rateest_put(struct xt_rateest *est)
 EXPORT_SYMBOL_GPL(xt_rateest_put);
 
 static unsigned int
-xt_rateest_tg(struct sk_buff *skb,
-	      const struct net_device *in,
-	      const struct net_device *out,
-	      unsigned int hooknum,
-	      const struct xt_target *target,
-	      const void *targinfo)
+xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	const struct xt_rateest_target_info *info = targinfo;
+	const struct xt_rateest_target_info *info = par->targinfo;
 	struct gnet_stats_basic *stats = &info->est->bstats;
 
 	spin_lock_bh(&info->est->lock);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 2a2ab833481..ad05214e380 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -29,12 +29,10 @@ MODULE_ALIAS("ip6t_SECMARK");
 static u8 mode;
 
 static unsigned int
-secmark_tg(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+secmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	u32 secmark = 0;
-	const struct xt_secmark_target_info *info = targinfo;
+	const struct xt_secmark_target_info *info = par->targinfo;
 
 	BUG_ON(info->mode != mode);
 
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index b868f995239..e08762d9b0f 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -174,15 +174,13 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
 }
 
 static unsigned int
-tcpmss_tg4(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+tcpmss_tg4(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct iphdr *iph = ip_hdr(skb);
 	__be16 newlen;
 	int ret;
 
-	ret = tcpmss_mangle_packet(skb, targinfo,
+	ret = tcpmss_mangle_packet(skb, par->targinfo,
 				   tcpmss_reverse_mtu(skb, PF_INET),
 				   iph->ihl * 4,
 				   sizeof(*iph) + sizeof(struct tcphdr));
@@ -199,9 +197,7 @@ tcpmss_tg4(struct sk_buff *skb, const struct net_device *in,
 
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 static unsigned int
-tcpmss_tg6(struct sk_buff *skb, const struct net_device *in,
-           const struct net_device *out, unsigned int hooknum,
-           const struct xt_target *target, const void *targinfo)
+tcpmss_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	u8 nexthdr;
@@ -212,7 +208,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct net_device *in,
 	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
 	if (tcphoff < 0)
 		return NF_DROP;
-	ret = tcpmss_mangle_packet(skb, targinfo,
+	ret = tcpmss_mangle_packet(skb, par->targinfo,
 				   tcpmss_reverse_mtu(skb, PF_INET6),
 				   tcphoff,
 				   sizeof(*ipv6h) + sizeof(struct tcphdr));
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 2e0ae6cc5d9..9dd8c8ef63e 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -75,19 +75,15 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
 }
 
 static unsigned int
-tcpoptstrip_tg4(struct sk_buff *skb, const struct net_device *in,
-		const struct net_device *out, unsigned int hooknum,
-		const struct xt_target *target, const void *targinfo)
+tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_target_param *par)
 {
-	return tcpoptstrip_mangle_packet(skb, targinfo, ip_hdrlen(skb),
+	return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb),
 	       sizeof(struct iphdr) + sizeof(struct tcphdr));
 }
 
 #if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE)
 static unsigned int
-tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in,
-		const struct net_device *out, unsigned int hooknum,
-		const struct xt_target *target, const void *targinfo)
+tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	int tcphoff;
@@ -98,7 +94,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in,
 	if (tcphoff < 0)
 		return NF_DROP;
 
-	return tcpoptstrip_mangle_packet(skb, targinfo, tcphoff,
+	return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff,
 	       sizeof(*ipv6h) + sizeof(struct tcphdr));
 }
 #endif
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 183f251d2f0..f08c49ea4bd 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -25,15 +25,10 @@
 #include <net/netfilter/nf_tproxy_core.h>
 
 static unsigned int
-tproxy_tg(struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  unsigned int hooknum,
-	  const struct xt_target *target,
-	  const void *targinfo)
+tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = targinfo;
+	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
 
@@ -44,7 +39,7 @@ tproxy_tg(struct sk_buff *skb,
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
 				   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
 				   hp->source, tgi->lport ? tgi->lport : hp->dest,
-				   in, true);
+				   par->in, true);
 
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && nf_tproxy_assign_sock(skb, sk)) {
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index da35f9f1cd7..fbb04b86c46 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -11,9 +11,7 @@ MODULE_ALIAS("ipt_TRACE");
 MODULE_ALIAS("ip6t_TRACE");
 
 static unsigned int
-trace_tg(struct sk_buff *skb, const struct net_device *in,
-         const struct net_device *out, unsigned int hooknum,
-         const struct xt_target *target, const void *targinfo)
+trace_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	skb->nf_trace = 1;
 	return XT_CONTINUE;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 79ea19375ca..89791a56429 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -188,6 +188,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 {
 	int ret = 0, result = 0;
 	struct tcf_ipt *ipt = a->priv;
+	struct xt_target_param par;
 
 	if (skb_cloned(skb)) {
 		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
@@ -203,10 +204,13 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 	/* yes, we have to worry about both in and out dev
 	 worry later - danger - this API seems to have changed
 	 from earlier kernels */
-	ret = ipt->tcfi_t->u.kernel.target->target(skb, skb->dev, NULL,
-						   ipt->tcfi_hook,
-						   ipt->tcfi_t->u.kernel.target,
-						   ipt->tcfi_t->data);
+	par.in       = skb->dev;
+	par.out      = NULL;
+	par.hooknum  = ipt->tcfi_hook;
+	par.target   = ipt->tcfi_t->u.kernel.target;
+	par.targinfo = ipt->tcfi_t->data;
+	ret = par.target->target(skb, &par);
+
 	switch (ret) {
 	case NF_ACCEPT:
 		result = TC_ACT_OK;
-- 
cgit v1.2.3


From af5d6dc200eb0fcc6fbd3df1ab4d8969004cb37f Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:19 +0200
Subject: netfilter: xtables: move extension arguments into compound structure
 (5/6)

This patch does this for target extensions' checkentry functions.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebt_arpreply.c | 10 +++-------
 net/bridge/netfilter/ebt_dnat.c     | 19 ++++++++++---------
 net/bridge/netfilter/ebt_log.c      |  7 ++-----
 net/bridge/netfilter/ebt_mark.c     |  8 ++------
 net/bridge/netfilter/ebt_nflog.c    |  7 ++-----
 net/bridge/netfilter/ebt_redirect.c | 17 +++++++++--------
 net/bridge/netfilter/ebt_snat.c     |  8 ++------
 net/bridge/netfilter/ebt_ulog.c     |  7 ++-----
 net/bridge/netfilter/ebtables.c     | 28 ++++++++++++++++------------
 net/ipv4/netfilter/arp_tables.c     | 20 +++++++++++---------
 net/ipv4/netfilter/arpt_mangle.c    |  6 ++----
 net/ipv4/netfilter/ip_tables.c      | 17 ++++++++++-------
 net/ipv4/netfilter/ipt_CLUSTERIP.c  | 13 +++++--------
 net/ipv4/netfilter/ipt_ECN.c        |  9 +++------
 net/ipv4/netfilter/ipt_LOG.c        |  7 ++-----
 net/ipv4/netfilter/ipt_MASQUERADE.c |  7 ++-----
 net/ipv4/netfilter/ipt_NETMAP.c     |  7 ++-----
 net/ipv4/netfilter/ipt_REDIRECT.c   |  7 ++-----
 net/ipv4/netfilter/ipt_REJECT.c     |  9 +++------
 net/ipv4/netfilter/ipt_TTL.c        |  7 ++-----
 net/ipv4/netfilter/ipt_ULOG.c       |  7 ++-----
 net/ipv4/netfilter/nf_nat_rule.c    | 16 ++++------------
 net/ipv6/netfilter/ip6_tables.c     | 16 ++++++++++------
 net/ipv6/netfilter/ip6t_HL.c        |  7 ++-----
 net/ipv6/netfilter/ip6t_LOG.c       |  7 ++-----
 net/ipv6/netfilter/ip6t_REJECT.c    |  9 +++------
 net/netfilter/x_tables.c            | 32 ++++++++++++++++----------------
 net/netfilter/xt_CONNMARK.c         | 24 +++++++++---------------
 net/netfilter/xt_CONNSECMARK.c      | 16 +++++++---------
 net/netfilter/xt_DSCP.c             | 19 +++++++------------
 net/netfilter/xt_MARK.c             | 14 ++++----------
 net/netfilter/xt_NFLOG.c            |  7 ++-----
 net/netfilter/xt_RATEEST.c          |  9 ++-------
 net/netfilter/xt_SECMARK.c          | 12 +++++-------
 net/netfilter/xt_TCPMSS.c           | 22 ++++++++--------------
 net/netfilter/xt_TPROXY.c           |  9 ++-------
 net/sched/act_ipt.c                 | 12 +++++++++---
 37 files changed, 186 insertions(+), 272 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index fc94699f719..76584cd72e5 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -57,20 +57,16 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return info->target;
 }
 
-static bool
-ebt_arpreply_tg_check(const char *tablename, const void *entry,
-		      const struct xt_target *target, void *data,
-		      unsigned int hookmask)
+static bool ebt_arpreply_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ebt_arpreply_info *info = data;
-	const struct ebt_entry *e = entry;
+	const struct ebt_arpreply_info *info = par->targinfo;
+	const struct ebt_entry *e = par->entryinfo;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
 		return false;
 	if (e->ethproto != htons(ETH_P_ARP) ||
 	    e->invflags & EBT_IPROTO)
 		return false;
-	CLEAR_BASE_CHAIN_BIT;
 	return true;
 }
 
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index bb5d79e0bee..6b49ea9e31f 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -26,19 +26,20 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return info->target;
 }
 
-static bool
-ebt_dnat_tg_check(const char *tablename, const void *entry,
-		  const struct xt_target *target, void *data,
-		  unsigned int hookmask)
+static bool ebt_dnat_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ebt_nat_info *info = data;
+	const struct ebt_nat_info *info = par->targinfo;
+	unsigned int hook_mask;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
 		return false;
-	CLEAR_BASE_CHAIN_BIT;
-	if ( (strcmp(tablename, "nat") ||
-	   (hookmask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) &&
-	   (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
+
+	hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+	if ((strcmp(par->table, "nat") != 0 ||
+	    (hook_mask & ~((1 << NF_BR_PRE_ROUTING) |
+	    (1 << NF_BR_LOCAL_OUT)))) &&
+	    (strcmp(par->table, "broute") != 0 ||
+	    hook_mask & ~(1 << NF_BR_BROUTING)))
 		return false;
 	if (INVALID_TARGET)
 		return false;
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 87de5fccb2f..3d33c608906 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -24,12 +24,9 @@
 
 static DEFINE_SPINLOCK(ebt_log_lock);
 
-static bool
-ebt_log_tg_check(const char *table, const void *entry,
-		 const struct xt_target *target, void *data,
-		 unsigned int hook_mask)
+static bool ebt_log_tg_check(const struct xt_tgchk_param *par)
 {
-	struct ebt_log_info *info = data;
+	struct ebt_log_info *info = par->targinfo;
 
 	if (info->bitmask & ~EBT_LOG_MASK)
 		return false;
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index aafc456c3c3..2fee7e8e2e9 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -36,18 +36,14 @@ ebt_mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return info->target | ~EBT_VERDICT_BITS;
 }
 
-static bool
-ebt_mark_tg_check(const char *table, const void *e,
-		  const struct xt_target *target, void *data,
-		  unsigned int hookmask)
+static bool ebt_mark_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ebt_mark_t_info *info = data;
+	const struct ebt_mark_t_info *info = par->targinfo;
 	int tmp;
 
 	tmp = info->target | ~EBT_VERDICT_BITS;
 	if (BASE_CHAIN && tmp == EBT_RETURN)
 		return false;
-	CLEAR_BASE_CHAIN_BIT;
 	if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
 		return false;
 	tmp = info->target & ~EBT_VERDICT_BITS;
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 6a28d994cf7..2a63d996dd4 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -35,12 +35,9 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return EBT_CONTINUE;
 }
 
-static bool
-ebt_nflog_tg_check(const char *table, const void *e,
-		   const struct xt_target *target, void *data,
-		   unsigned int hookmask)
+static bool ebt_nflog_tg_check(const struct xt_tgchk_param *par)
 {
-	struct ebt_nflog_info *info = data;
+	struct ebt_nflog_info *info = par->targinfo;
 
 	if (info->flags & ~EBT_NFLOG_MASK)
 		return false;
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 0cfe2fad940..c8a49f7a57b 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -32,18 +32,19 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return info->target;
 }
 
-static bool
-ebt_redirect_tg_check(const char *tablename, const void *e,
-		      const struct xt_target *target, void *data,
-		      unsigned int hookmask)
+static bool ebt_redirect_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ebt_redirect_info *info = data;
+	const struct ebt_redirect_info *info = par->targinfo;
+	unsigned int hook_mask;
 
 	if (BASE_CHAIN && info->target == EBT_RETURN)
 		return false;
-	CLEAR_BASE_CHAIN_BIT;
-	if ( (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING)) &&
-	     (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
+
+	hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+	if ((strcmp(par->table, "nat") != 0 ||
+	    hook_mask & ~(1 << NF_BR_PRE_ROUTING)) &&
+	    (strcmp(par->table, "broute") != 0 ||
+	    hook_mask & ~(1 << NF_BR_BROUTING)))
 		return false;
 	if (INVALID_TARGET)
 		return false;
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index f55960eee99..8d04d4c302b 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -42,18 +42,14 @@ out:
 	return info->target | ~EBT_VERDICT_BITS;
 }
 
-static bool
-ebt_snat_tg_check(const char *tablename, const void *e,
-		  const struct xt_target *target, void *data,
-		  unsigned int hookmask)
+static bool ebt_snat_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ebt_nat_info *info = data;
+	const struct ebt_nat_info *info = par->targinfo;
 	int tmp;
 
 	tmp = info->target | ~EBT_VERDICT_BITS;
 	if (BASE_CHAIN && tmp == EBT_RETURN)
 		return false;
-	CLEAR_BASE_CHAIN_BIT;
 
 	if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
 		return false;
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index bfedf12cbf4..2c6d6823e70 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -254,12 +254,9 @@ ebt_ulog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return EBT_CONTINUE;
 }
 
-static bool
-ebt_ulog_tg_check(const char *table, const void *entry,
-		  const struct xt_target *target, void *data,
-		  unsigned int hookmask)
+static bool ebt_ulog_tg_check(const struct xt_tgchk_param *par)
 {
-	struct ebt_ulog_info *uloginfo = data;
+	struct ebt_ulog_info *uloginfo = par->targinfo;
 
 	if (uloginfo->nlgroup > 31)
 		return false;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index a1156bab4a0..cf823c21c16 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -363,9 +363,10 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
 }
 
 static inline int
-ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
-   const char *name, unsigned int hookmask, unsigned int *cnt)
+ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
+		  unsigned int *cnt)
 {
+	const struct ebt_entry *e = par->entryinfo;
 	struct xt_target *watcher;
 	size_t left = ((char *)e + e->target_offset) - (char *)w;
 	int ret;
@@ -383,9 +384,10 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 		return -ENOENT;
 	w->u.watcher = watcher;
 
-	ret = xt_check_target(watcher, NFPROTO_BRIDGE, w->watcher_size,
-	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO,
-	      e, w->data);
+	par->target   = watcher;
+	par->targinfo = w->data;
+	ret = xt_check_target(par, NFPROTO_BRIDGE, w->watcher_size,
+	      e->ethproto, e->invflags & EBT_IPROTO);
 	if (ret < 0) {
 		module_put(watcher->me);
 		return ret;
@@ -619,6 +621,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	size_t gap;
 	int ret;
 	struct xt_mtchk_param mtpar;
+	struct xt_tgchk_param tgpar;
 
 	/* don't mess with the struct ebt_entries */
 	if (e->bitmask == 0)
@@ -660,14 +663,14 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	}
 	i = 0;
 
-	mtpar.table     = name;
-	mtpar.entryinfo = e;
-	mtpar.hook_mask = hookmask;
+	mtpar.table     = tgpar.table     = name;
+	mtpar.entryinfo = tgpar.entryinfo = e;
+	mtpar.hook_mask = tgpar.hook_mask = hookmask;
 	ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
 	if (ret != 0)
 		goto cleanup_matches;
 	j = 0;
-	ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, e, name, hookmask, &j);
+	ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
 	if (ret != 0)
 		goto cleanup_watchers;
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
@@ -703,9 +706,10 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 		goto cleanup_watchers;
 	}
 
-	ret = xt_check_target(target, NFPROTO_BRIDGE, t->target_size,
-	      name, hookmask, e->ethproto, e->invflags & EBT_IPROTO,
-	      e, t->data);
+	tgpar.target   = target;
+	tgpar.targinfo = t->data;
+	ret = xt_check_target(&tgpar, NFPROTO_BRIDGE, t->target_size,
+	      e->ethproto, e->invflags & EBT_IPROTO);
 	if (ret < 0) {
 		module_put(target->me);
 		goto cleanup_watchers;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5b631ad74b5..b3238d0101c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -457,16 +457,18 @@ static inline int check_entry(struct arpt_entry *e, const char *name)
 
 static inline int check_target(struct arpt_entry *e, const char *name)
 {
-	struct arpt_entry_target *t;
-	struct xt_target *target;
+	struct arpt_entry_target *t = arpt_get_target(e);
 	int ret;
-
-	t = arpt_get_target(e);
-	target = t->u.kernel.target;
-
-	ret = xt_check_target(target, NFPROTO_ARP,
-			      t->u.target_size - sizeof(*t),
-			      name, e->comefrom, 0, 0, e, t->data);
+	struct xt_tgchk_param par = {
+		.table     = name,
+		.entryinfo = e,
+		.target    = t->u.kernel.target,
+		.targinfo  = t->data,
+		.hook_mask = e->comefrom,
+	};
+
+	ret = xt_check_target(&par, NFPROTO_ARP,
+	      t->u.target_size - sizeof(*t), 0, false);
 	if (ret < 0) {
 		duprintf("arp_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 0bf81b35369..b0d5b1d0a76 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -54,11 +54,9 @@ target(struct sk_buff *skb, const struct xt_target_param *par)
 	return mangle->target;
 }
 
-static bool
-checkentry(const char *tablename, const void *e, const struct xt_target *target,
-	   void *targinfo, unsigned int hook_mask)
+static bool checkentry(const struct xt_tgchk_param *par)
 {
-	const struct arpt_mangle *mangle = targinfo;
+	const struct arpt_mangle *mangle = par->targinfo;
 
 	if (mangle->flags & ~ARPT_MANGLE_MASK ||
 	    !(mangle->flags & ARPT_MANGLE_MASK))
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 0f8ecf39022..e592c54d499 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -655,15 +655,18 @@ err:
 
 static int check_target(struct ipt_entry *e, const char *name)
 {
-	struct ipt_entry_target *t;
-	struct xt_target *target;
+	struct ipt_entry_target *t = ipt_get_target(e);
+	struct xt_tgchk_param par = {
+		.table     = name,
+		.entryinfo = e,
+		.target    = t->u.kernel.target,
+		.targinfo  = t->data,
+		.hook_mask = e->comefrom,
+	};
 	int ret;
 
-	t = ipt_get_target(e);
-	target = t->u.kernel.target;
-	ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
-			      name, e->comefrom, e->ip.proto,
-			      e->ip.invflags & IPT_INV_PROTO, e, t->data);
+	ret = xt_check_target(&par, NFPROTO_IPV4, t->u.target_size - sizeof(*t),
+	      e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 67e8aa8f34f..6c7254e0256 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -347,13 +347,10 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-clusterip_tg_check(const char *tablename, const void *e_void,
-                   const struct xt_target *target, void *targinfo,
-                   unsigned int hook_mask)
+static bool clusterip_tg_check(const struct xt_tgchk_param *par)
 {
-	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
-	const struct ipt_entry *e = e_void;
+	struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
 
 	struct clusterip_config *config;
 
@@ -404,9 +401,9 @@ clusterip_tg_check(const char *tablename, const void *e_void,
 	}
 	cipinfo->config = config;
 
-	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", target->family);
+				    "proto=%u\n", par->target->family);
 		return false;
 	}
 
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index e37f181e829..f7e2fa0974d 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -93,13 +93,10 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-ecn_tg_check(const char *tablename, const void *e_void,
-             const struct xt_target *target, void *targinfo,
-             unsigned int hook_mask)
+static bool ecn_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ipt_ECN_info *einfo = targinfo;
-	const struct ipt_entry *e = e_void;
+	const struct ipt_ECN_info *einfo = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
 
 	if (einfo->operation & IPT_ECN_OP_MASK) {
 		printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index e9942aed35a..fc6ce04a3e3 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -440,12 +440,9 @@ log_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-log_tg_check(const char *tablename, const void *e,
-             const struct xt_target *target, void *targinfo,
-             unsigned int hook_mask)
+static bool log_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ipt_log_info *loginfo = targinfo;
+	const struct ipt_log_info *loginfo = par->targinfo;
 
 	if (loginfo->level >= 8) {
 		pr_debug("LOG: level %u >= 8\n", loginfo->level);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index e0d9d49b79e..f389f60cb10 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -31,12 +31,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
 static DEFINE_RWLOCK(masq_lock);
 
 /* FIXME: Multiple targets. --RR */
-static bool
-masquerade_tg_check(const char *tablename, const void *e,
-                    const struct xt_target *target, void *targinfo,
-                    unsigned int hook_mask)
+static bool masquerade_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
 	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
 		pr_debug("masquerade_check: bad MAP_IPS.\n");
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index cf18f23b346..7c29582d4ec 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -22,12 +22,9 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
 MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
 
-static bool
-netmap_tg_check(const char *tablename, const void *e,
-                const struct xt_target *target, void *targinfo,
-                unsigned int hook_mask)
+static bool netmap_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
 	if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
 		pr_debug("NETMAP:check: bad MAP_IPS.\n");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 23adb09ddfb..698e5e78685 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -26,12 +26,9 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
 
 /* FIXME: Take multiple ranges --RR */
-static bool
-redirect_tg_check(const char *tablename, const void *e,
-                  const struct xt_target *target, void *targinfo,
-                  unsigned int hook_mask)
+static bool redirect_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
 	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
 		pr_debug("redirect_check: bad MAP_IPS.\n");
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b36071bb107..0b4b6e0ff2b 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -175,13 +175,10 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return NF_DROP;
 }
 
-static bool
-reject_tg_check(const char *tablename, const void *e_void,
-                const struct xt_target *target, void *targinfo,
-                unsigned int hook_mask)
+static bool reject_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ipt_reject_info *rejinfo = targinfo;
-	const struct ipt_entry *e = e_void;
+	const struct ipt_reject_info *rejinfo = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
 
 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
 		printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 05cbfd2f747..6d76aae90cc 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -59,12 +59,9 @@ ttl_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-ttl_tg_check(const char *tablename, const void *e,
-             const struct xt_target *target, void *targinfo,
-             unsigned int hook_mask)
+static bool ttl_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ipt_TTL_info *info = targinfo;
+	const struct ipt_TTL_info *info = par->targinfo;
 
 	if (info->mode > IPT_TTL_MAXMODE) {
 		printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 46c0df0dc2d..18a2826b57c 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -313,12 +313,9 @@ static void ipt_logfn(u_int8_t pf,
 	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
 }
 
-static bool
-ulog_tg_check(const char *tablename, const void *e,
-              const struct xt_target *target, void *targinfo,
-              unsigned int hookmask)
+static bool ulog_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ipt_ulog_info *loginfo = targinfo;
+	const struct ipt_ulog_info *loginfo = par->targinfo;
 
 	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
 		pr_debug("ipt_ULOG: prefix term %i\n",
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 83170ff131f..bea54a68510 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -128,13 +128,9 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
 	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
 }
 
-static bool ipt_snat_checkentry(const char *tablename,
-				const void *entry,
-				const struct xt_target *target,
-				void *targinfo,
-				unsigned int hook_mask)
+static bool ipt_snat_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
 	/* Must be a valid range */
 	if (mr->rangesize != 1) {
@@ -144,13 +140,9 @@ static bool ipt_snat_checkentry(const char *tablename,
 	return true;
 }
 
-static bool ipt_dnat_checkentry(const char *tablename,
-				const void *entry,
-				const struct xt_target *target,
-				void *targinfo,
-				unsigned int hook_mask)
+static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_multi_range_compat *mr = targinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
 	/* Must be a valid range */
 	if (mr->rangesize != 1) {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ee0986cdbd6..ca14fb8bd36 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -679,15 +679,19 @@ err:
 
 static int check_target(struct ip6t_entry *e, const char *name)
 {
-	struct ip6t_entry_target *t;
-	struct xt_target *target;
+	struct ip6t_entry_target *t = ip6t_get_target(e);
+	struct xt_tgchk_param par = {
+		.table     = name,
+		.entryinfo = e,
+		.target    = t->u.kernel.target,
+		.targinfo  = t->data,
+		.hook_mask = e->comefrom,
+	};
 	int ret;
 
 	t = ip6t_get_target(e);
-	target = t->u.kernel.target;
-	ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t),
-			      name, e->comefrom, e->ipv6.proto,
-			      e->ipv6.invflags & IP6T_INV_PROTO, e, t->data);
+	ret = xt_check_target(&par, NFPROTO_IPV6, t->u.target_size - sizeof(*t),
+	      e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index ac759a54f2c..27b5adf670a 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -54,12 +54,9 @@ hl_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-hl_tg6_check(const char *tablename, const void *entry,
-             const struct xt_target *target, void *targinfo,
-             unsigned int hook_mask)
+static bool hl_tg6_check(const struct xt_tgchk_param *par)
 {
-	const struct ip6t_HL_info *info = targinfo;
+	const struct ip6t_HL_info *info = par->targinfo;
 
 	if (info->mode > IP6T_HL_MAXMODE) {
 		printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n",
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index a31d3ecd1fc..caa441d0956 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -453,12 +453,9 @@ log_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 }
 
 
-static bool
-log_tg6_check(const char *tablename, const void *entry,
-              const struct xt_target *target, void *targinfo,
-              unsigned int hook_mask)
+static bool log_tg6_check(const struct xt_tgchk_param *par)
 {
-	const struct ip6t_log_info *loginfo = targinfo;
+	const struct ip6t_log_info *loginfo = par->targinfo;
 
 	if (loginfo->level >= 8) {
 		pr_debug("LOG: level %u >= 8\n", loginfo->level);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 1d5f3a70ed0..0981b4ccb8b 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -213,13 +213,10 @@ reject_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 	return NF_DROP;
 }
 
-static bool
-reject_tg6_check(const char *tablename, const void *entry,
-                 const struct xt_target *target, void *targinfo,
-                 unsigned int hook_mask)
+static bool reject_tg6_check(const struct xt_tgchk_param *par)
 {
-	const struct ip6t_reject_info *rejinfo = targinfo;
-	const struct ip6t_entry *e = entry;
+	const struct ip6t_reject_info *rejinfo = par->targinfo;
+	const struct ip6t_entry *e = par->entryinfo;
 
 	if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
 		printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 817ab14f7cd..f29513cd139 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -471,35 +471,35 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr,
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 #endif /* CONFIG_COMPAT */
 
-int xt_check_target(const struct xt_target *target, unsigned short family,
-		    unsigned int size, const char *table, unsigned int hook_mask,
-		    unsigned short proto, int inv_proto, const void *entry,
-		    void *targinfo)
+int xt_check_target(struct xt_tgchk_param *par, u_int8_t family,
+		    unsigned int size, u_int8_t proto, bool inv_proto)
 {
-	if (XT_ALIGN(target->targetsize) != size) {
+	if (XT_ALIGN(par->target->targetsize) != size) {
 		printk("%s_tables: %s target: invalid size %Zu != %u\n",
-		       xt_prefix[family], target->name,
-		       XT_ALIGN(target->targetsize), size);
+		       xt_prefix[family], par->target->name,
+		       XT_ALIGN(par->target->targetsize), size);
 		return -EINVAL;
 	}
-	if (target->table && strcmp(target->table, table)) {
+	if (par->target->table != NULL &&
+	    strcmp(par->target->table, par->table) != 0) {
 		printk("%s_tables: %s target: only valid in %s table, not %s\n",
-		       xt_prefix[family], target->name, target->table, table);
+		       xt_prefix[family], par->target->name,
+		       par->target->table, par->table);
 		return -EINVAL;
 	}
-	if (target->hooks && (hook_mask & ~target->hooks) != 0) {
+	if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
 		printk("%s_tables: %s target: bad hook_mask %#x/%#x\n",
-		       xt_prefix[family], target->name, hook_mask,
-		       target->hooks);
+		       xt_prefix[family], par->target->name, par->hook_mask,
+		       par->target->hooks);
 		return -EINVAL;
 	}
-	if (target->proto && (target->proto != proto || inv_proto)) {
+	if (par->target->proto && (par->target->proto != proto || inv_proto)) {
 		printk("%s_tables: %s target: only valid for protocol %u\n",
-		       xt_prefix[family], target->name, target->proto);
+		       xt_prefix[family], par->target->name,
+		       par->target->proto);
 		return -EINVAL;
 	}
-	if (target->checkentry != NULL &&
-	    !target->checkentry(table, entry, target, targinfo, hook_mask))
+	if (par->target->checkentry != NULL && !par->target->checkentry(par))
 		return -EINVAL;
 	return 0;
 }
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index 95ed267328a..8fc9f35e67d 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -112,18 +112,15 @@ connmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-connmark_tg_check_v0(const char *tablename, const void *entry,
-                     const struct xt_target *target, void *targinfo,
-                     unsigned int hook_mask)
+static bool connmark_tg_check_v0(const struct xt_tgchk_param *par)
 {
-	const struct xt_connmark_target_info *matchinfo = targinfo;
+	const struct xt_connmark_target_info *matchinfo = par->targinfo;
 
 	if (matchinfo->mode == XT_CONNMARK_RESTORE) {
-		if (strcmp(tablename, "mangle") != 0) {
+		if (strcmp(par->table, "mangle") != 0) {
 			printk(KERN_WARNING "CONNMARK: restore can only be "
 			       "called from \"mangle\" table, not \"%s\"\n",
-			       tablename);
+			       par->table);
 			return false;
 		}
 	}
@@ -131,22 +128,19 @@ connmark_tg_check_v0(const char *tablename, const void *entry,
 		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
 		return false;
 	}
-	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", target->family);
+				    "proto=%u\n", par->target->family);
 		return false;
 	}
 	return true;
 }
 
-static bool
-connmark_tg_check(const char *tablename, const void *entry,
-                  const struct xt_target *target, void *targinfo,
-                  unsigned int hook_mask)
+static bool connmark_tg_check(const struct xt_tgchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
 		printk(KERN_WARNING "cannot load conntrack support for "
-		       "proto=%u\n", target->family);
+		       "proto=%u\n", par->target->family);
 		return false;
 	}
 	return true;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 2211a2cef28..2041a3d4b4d 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -85,16 +85,14 @@ connsecmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-connsecmark_tg_check(const char *tablename, const void *entry,
-                     const struct xt_target *target, void *targinfo,
-                     unsigned int hook_mask)
+static bool connsecmark_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct xt_connsecmark_target_info *info = targinfo;
+	const struct xt_connsecmark_target_info *info = par->targinfo;
 
-	if (strcmp(tablename, "mangle") && strcmp(tablename, "security")) {
+	if (strcmp(par->table, "mangle") != 0 &&
+	    strcmp(par->table, "security") != 0) {
 		printk(KERN_INFO PFX "target only valid in the \'mangle\' "
-		       "or \'security\' tables, not \'%s\'.\n", tablename);
+		       "or \'security\' tables, not \'%s\'.\n", par->table);
 		return false;
 	}
 
@@ -108,9 +106,9 @@ connsecmark_tg_check(const char *tablename, const void *entry,
 		return false;
 	}
 
-	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", target->family);
+				    "proto=%u\n", par->target->family);
 		return false;
 	}
 	return true;
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index c78e80afdf3..6a347e768f8 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -61,15 +61,12 @@ dscp_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-dscp_tg_check(const char *tablename, const void *e_void,
-              const struct xt_target *target, void *targinfo,
-              unsigned int hook_mask)
+static bool dscp_tg_check(const struct xt_tgchk_param *par)
 {
-	const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp;
+	const struct xt_DSCP_info *info = par->targinfo;
 
-	if (dscp > XT_DSCP_MAX) {
-		printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
+	if (info->dscp > XT_DSCP_MAX) {
+		printk(KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp);
 		return false;
 	}
 	return true;
@@ -95,12 +92,10 @@ tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-tos_tg_check_v0(const char *tablename, const void *e_void,
-                const struct xt_target *target, void *targinfo,
-                unsigned int hook_mask)
+static bool tos_tg_check_v0(const struct xt_tgchk_param *par)
 {
-	const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
+	const struct ipt_tos_target_info *info = par->targinfo;
+	const uint8_t tos = info->tos;
 
 	if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT &&
 	    tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST &&
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 27d03f39611..123ee0ba78c 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -66,12 +66,9 @@ mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-mark_tg_check_v0(const char *tablename, const void *entry,
-                 const struct xt_target *target, void *targinfo,
-                 unsigned int hook_mask)
+static bool mark_tg_check_v0(const struct xt_tgchk_param *par)
 {
-	const struct xt_mark_target_info *markinfo = targinfo;
+	const struct xt_mark_target_info *markinfo = par->targinfo;
 
 	if (markinfo->mark > 0xffffffff) {
 		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
@@ -80,12 +77,9 @@ mark_tg_check_v0(const char *tablename, const void *entry,
 	return true;
 }
 
-static bool
-mark_tg_check_v1(const char *tablename, const void *entry,
-                 const struct xt_target *target, void *targinfo,
-                 unsigned int hook_mask)
+static bool mark_tg_check_v1(const struct xt_tgchk_param *par)
 {
-	const struct xt_mark_target_info_v1 *markinfo = targinfo;
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
 
 	if (markinfo->mode != XT_MARK_SET
 	    && markinfo->mode != XT_MARK_AND
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 3218ad63bd1..56ee4f118b5 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -36,12 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-nflog_tg_check(const char *tablename, const void *entry,
-               const struct xt_target *target, void *targetinfo,
-               unsigned int hookmask)
+static bool nflog_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct xt_nflog_info *info = targetinfo;
+	const struct xt_nflog_info *info = par->targinfo;
 
 	if (info->flags & ~XT_NFLOG_MASK)
 		return false;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 92e33524f78..edf4ab1f30f 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -84,14 +84,9 @@ xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-xt_rateest_tg_checkentry(const char *tablename,
-			 const void *entry,
-			 const struct xt_target *target,
-			 void *targinfo,
-			 unsigned int hook_mask)
+static bool xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 {
-	struct xt_rateest_target_info *info = targinfo;
+	struct xt_rateest_target_info *info = par->targinfo;
 	struct xt_rateest *est;
 	struct {
 		struct nlattr		opt;
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index ad05214e380..e5777227192 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -80,16 +80,14 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info)
 	return true;
 }
 
-static bool
-secmark_tg_check(const char *tablename, const void *entry,
-                 const struct xt_target *target, void *targinfo,
-                 unsigned int hook_mask)
+static bool secmark_tg_check(const struct xt_tgchk_param *par)
 {
-	struct xt_secmark_target_info *info = targinfo;
+	struct xt_secmark_target_info *info = par->targinfo;
 
-	if (strcmp(tablename, "mangle") && strcmp(tablename, "security")) {
+	if (strcmp(par->table, "mangle") != 0 &&
+	    strcmp(par->table, "security") != 0) {
 		printk(KERN_INFO PFX "target only valid in the \'mangle\' "
-		       "or \'security\' tables, not \'%s\'.\n", tablename);
+		       "or \'security\' tables, not \'%s\'.\n", par->table);
 		return false;
 	}
 
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index e08762d9b0f..4f3b1f80879 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -237,16 +237,13 @@ static inline bool find_syn_match(const struct xt_entry_match *m)
 	return false;
 }
 
-static bool
-tcpmss_tg4_check(const char *tablename, const void *entry,
-                 const struct xt_target *target, void *targinfo,
-                 unsigned int hook_mask)
+static bool tcpmss_tg4_check(const struct xt_tgchk_param *par)
 {
-	const struct xt_tcpmss_info *info = targinfo;
-	const struct ipt_entry *e = entry;
+	const struct xt_tcpmss_info *info = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
 
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
-	    (hook_mask & ~((1 << NF_INET_FORWARD) |
+	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
 		printk("xt_TCPMSS: path-MTU clamping only supported in "
@@ -260,16 +257,13 @@ tcpmss_tg4_check(const char *tablename, const void *entry,
 }
 
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-static bool
-tcpmss_tg6_check(const char *tablename, const void *entry,
-                 const struct xt_target *target, void *targinfo,
-                 unsigned int hook_mask)
+static bool tcpmss_tg6_check(const struct xt_tgchk_param *par)
 {
-	const struct xt_tcpmss_info *info = targinfo;
-	const struct ip6t_entry *e = entry;
+	const struct xt_tcpmss_info *info = par->targinfo;
+	const struct ip6t_entry *e = par->entryinfo;
 
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
-	    (hook_mask & ~((1 << NF_INET_FORWARD) |
+	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
 		printk("xt_TCPMSS: path-MTU clamping only supported in "
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index f08c49ea4bd..1340c2fa362 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -59,14 +59,9 @@ tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return NF_DROP;
 }
 
-static bool
-tproxy_tg_check(const char *tablename,
-		const void *entry,
-		const struct xt_target *target,
-		void *targetinfo,
-		unsigned int hook_mask)
+static bool tproxy_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct ipt_ip *i = entry;
+	const struct ipt_ip *i = par->entryinfo;
 
 	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
 	    && !(i->invflags & IPT_INV_PROTO))
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 89791a56429..a54dc3f8234 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -40,6 +40,7 @@ static struct tcf_hashinfo ipt_hash_info = {
 
 static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 {
+	struct xt_tgchk_param par;
 	struct xt_target *target;
 	int ret = 0;
 
@@ -49,9 +50,14 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 		return -ENOENT;
 
 	t->u.kernel.target = target;
-
-	ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
-			      table, hook, 0, 0, NULL, t->data);
+	par.table     = table;
+	par.entryinfo = NULL;
+	par.target    = target;
+	par.targinfo  = t->data;
+	par.hook_mask = hook;
+
+	ret = xt_check_target(&par, NFPROTO_IPV4,
+	      t->u.target_size - sizeof(*t), 0, false);
 	if (ret < 0) {
 		module_put(t->u.kernel.target->me);
 		return ret;
-- 
cgit v1.2.3


From a2df1648ba615dd5908e9a1fa7b2f133fa302487 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:19 +0200
Subject: netfilter: xtables: move extension arguments into compound structure
 (6/6)

This patch does this for target extensions' destroy functions.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c    | 19 +++++++++++++------
 net/ipv4/netfilter/arp_tables.c    |  9 ++++++---
 net/ipv4/netfilter/ip_tables.c     | 10 +++++++---
 net/ipv4/netfilter/ipt_CLUSTERIP.c |  6 +++---
 net/ipv6/netfilter/ip6_tables.c    | 10 +++++++---
 net/netfilter/xt_CONNMARK.c        |  5 ++---
 net/netfilter/xt_CONNSECMARK.c     |  5 ++---
 net/netfilter/xt_RATEEST.c         |  5 ++---
 net/netfilter/xt_SECMARK.c         |  2 +-
 net/sched/act_ipt.c                | 10 +++++++---
 10 files changed, 50 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index cf823c21c16..29d8061fa15 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -581,18 +581,23 @@ ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i)
 static inline int
 ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i)
 {
+	struct xt_tgdtor_param par;
+
 	if (i && (*i)-- == 0)
 		return 1;
-	if (w->u.watcher->destroy)
-		w->u.watcher->destroy(w->u.watcher, w->data);
-	module_put(w->u.watcher->me);
 
+	par.target   = w->u.watcher;
+	par.targinfo = w->data;
+	if (par.target->destroy != NULL)
+		par.target->destroy(&par);
+	module_put(par.target->me);
 	return 0;
 }
 
 static inline int
 ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
 {
+	struct xt_tgdtor_param par;
 	struct ebt_entry_target *t;
 
 	if (e->bitmask == 0)
@@ -603,10 +608,12 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
 	EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL);
 	EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
-	if (t->u.target->destroy)
-		t->u.target->destroy(t->u.target, t->data);
-	module_put(t->u.target->me);
 
+	par.target   = t->u.target;
+	par.targinfo = t->data;
+	if (par.target->destroy != NULL)
+		par.target->destroy(&par);
+	module_put(par.target->me);
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b3238d0101c..3bab78330cf 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -557,15 +557,18 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 
 static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
 {
+	struct xt_tgdtor_param par;
 	struct arpt_entry_target *t;
 
 	if (i && (*i)-- == 0)
 		return 1;
 
 	t = arpt_get_target(e);
-	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
-	module_put(t->u.kernel.target->me);
+	par.target   = t->u.kernel.target;
+	par.targinfo = t->data;
+	if (par.target->destroy != NULL)
+		par.target->destroy(&par);
+	module_put(par.target->me);
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e592c54d499..50b9a6c34c3 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -768,6 +768,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 static int
 cleanup_entry(struct ipt_entry *e, unsigned int *i)
 {
+	struct xt_tgdtor_param par;
 	struct ipt_entry_target *t;
 
 	if (i && (*i)-- == 0)
@@ -776,9 +777,12 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i)
 	/* Cleanup all matches */
 	IPT_MATCH_ITERATE(e, cleanup_match, NULL);
 	t = ipt_get_target(e);
-	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
-	module_put(t->u.kernel.target->me);
+
+	par.target   = t->u.kernel.target;
+	par.targinfo = t->data;
+	if (par.target->destroy != NULL)
+		par.target->destroy(&par);
+	module_put(par.target->me);
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6c7254e0256..7ac1677419a 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -411,9 +411,9 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
 }
 
 /* drop reference count of cluster config when rule is deleted */
-static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo)
+static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 {
-	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+	const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
 
 	/* if no more entries are referencing the config, remove it
 	 * from the list and destroy the proc entry */
@@ -421,7 +421,7 @@ static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo)
 
 	clusterip_config_put(cipinfo->config);
 
-	nf_ct_l3proto_module_put(target->family);
+	nf_ct_l3proto_module_put(par->target->family);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ca14fb8bd36..d934a699463 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -793,6 +793,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 static int
 cleanup_entry(struct ip6t_entry *e, unsigned int *i)
 {
+	struct xt_tgdtor_param par;
 	struct ip6t_entry_target *t;
 
 	if (i && (*i)-- == 0)
@@ -801,9 +802,12 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i)
 	/* Cleanup all matches */
 	IP6T_MATCH_ITERATE(e, cleanup_match, NULL);
 	t = ip6t_get_target(e);
-	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
-	module_put(t->u.kernel.target->me);
+
+	par.target   = t->u.kernel.target;
+	par.targinfo = t->data;
+	if (par.target->destroy != NULL)
+		par.target->destroy(&par);
+	module_put(par.target->me);
 	return 0;
 }
 
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index 8fc9f35e67d..c5a5072e005 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -146,10 +146,9 @@ static bool connmark_tg_check(const struct xt_tgchk_param *par)
 	return true;
 }
 
-static void
-connmark_tg_destroy(const struct xt_target *target, void *targinfo)
+static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
-	nf_ct_l3proto_module_put(target->family);
+	nf_ct_l3proto_module_put(par->target->family);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 2041a3d4b4d..b6e3f3f125f 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -114,10 +114,9 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par)
 	return true;
 }
 
-static void
-connsecmark_tg_destroy(const struct xt_target *target, void *targinfo)
+static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
-	nf_ct_l3proto_module_put(target->family);
+	nf_ct_l3proto_module_put(par->target->family);
 }
 
 static struct xt_target connsecmark_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index edf4ab1f30f..43f5676b1af 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -139,10 +139,9 @@ err1:
 	return false;
 }
 
-static void xt_rateest_tg_destroy(const struct xt_target *target,
-				  void *targinfo)
+static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par)
 {
-	struct xt_rateest_target_info *info = targinfo;
+	struct xt_rateest_target_info *info = par->targinfo;
 
 	xt_rateest_put(info->est);
 }
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index e5777227192..7a6f9e6f5df 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -113,7 +113,7 @@ static bool secmark_tg_check(const struct xt_tgchk_param *par)
 	return true;
 }
 
-static void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
+static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
 	switch (mode) {
 	case SECMARK_MODE_SEL:
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index a54dc3f8234..b951d422db9 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -67,9 +67,13 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 
 static void ipt_destroy_target(struct ipt_entry_target *t)
 {
-	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
-	module_put(t->u.kernel.target->me);
+	struct xt_tgdtor_param par = {
+		.target   = t->u.kernel.target,
+		.targinfo = t->data,
+	};
+	if (par.target->destroy != NULL)
+		par.target->destroy(&par);
+	module_put(par.target->me);
 }
 
 static int tcf_ipt_release(struct tcf_ipt *ipt, int bind)
-- 
cgit v1.2.3


From 916a917dfec18535ff9e2afdafba82e6279eb4f4 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:20 +0200
Subject: netfilter: xtables: provide invoked family value to extensions

By passing in the family through which extensions were invoked, a bit
of data space can be reclaimed. The "family" member will be added to
the parameter structures and the check functions be adjusted.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/bridge/netfilter/ebtables.c | 11 ++++++++---
 net/ipv4/netfilter/arp_tables.c |  6 ++++--
 net/ipv4/netfilter/ip_tables.c  | 10 ++++++++--
 net/ipv6/netfilter/ip6_tables.c | 10 ++++++++--
 net/netfilter/x_tables.c        | 23 ++++++++++++-----------
 net/sched/act_ipt.c             |  4 ++--
 6 files changed, 42 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 29d8061fa15..5bb88eb0aad 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -160,6 +160,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
 
+	mtpar.family  = tgpar.family = NFPROTO_BRIDGE;
 	mtpar.in      = tgpar.in  = in;
 	mtpar.out     = tgpar.out = out;
 	mtpar.hotdrop = &hotdrop;
@@ -351,7 +352,7 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
 
 	par->match     = match;
 	par->matchinfo = m->data;
-	ret = xt_check_match(par, NFPROTO_BRIDGE, m->match_size,
+	ret = xt_check_match(par, m->match_size,
 	      e->ethproto, e->invflags & EBT_IPROTO);
 	if (ret < 0) {
 		module_put(match->me);
@@ -386,7 +387,7 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
 
 	par->target   = watcher;
 	par->targinfo = w->data;
-	ret = xt_check_target(par, NFPROTO_BRIDGE, w->watcher_size,
+	ret = xt_check_target(par, w->watcher_size,
 	      e->ethproto, e->invflags & EBT_IPROTO);
 	if (ret < 0) {
 		module_put(watcher->me);
@@ -572,6 +573,7 @@ ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i)
 
 	par.match     = m->u.match;
 	par.matchinfo = m->data;
+	par.family    = NFPROTO_BRIDGE;
 	if (par.match->destroy != NULL)
 		par.match->destroy(&par);
 	module_put(par.match->me);
@@ -588,6 +590,7 @@ ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i)
 
 	par.target   = w->u.watcher;
 	par.targinfo = w->data;
+	par.family   = NFPROTO_BRIDGE;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
@@ -611,6 +614,7 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
 
 	par.target   = t->u.target;
 	par.targinfo = t->data;
+	par.family   = NFPROTO_BRIDGE;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
@@ -673,6 +677,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	mtpar.table     = tgpar.table     = name;
 	mtpar.entryinfo = tgpar.entryinfo = e;
 	mtpar.hook_mask = tgpar.hook_mask = hookmask;
+	mtpar.family    = tgpar.family    = NFPROTO_BRIDGE;
 	ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
 	if (ret != 0)
 		goto cleanup_matches;
@@ -715,7 +720,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 
 	tgpar.target   = target;
 	tgpar.targinfo = t->data;
-	ret = xt_check_target(&tgpar, NFPROTO_BRIDGE, t->target_size,
+	ret = xt_check_target(&tgpar, t->target_size,
 	      e->ethproto, e->invflags & EBT_IPROTO);
 	if (ret < 0) {
 		module_put(target->me);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3bab78330cf..8d70d29f1cc 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -246,6 +246,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	tgpar.in      = in;
 	tgpar.out     = out;
 	tgpar.hooknum = hook;
+	tgpar.family  = NFPROTO_ARP;
 
 	arp = arp_hdr(skb);
 	do {
@@ -465,10 +466,10 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 		.target    = t->u.kernel.target,
 		.targinfo  = t->data,
 		.hook_mask = e->comefrom,
+		.family    = NFPROTO_ARP,
 	};
 
-	ret = xt_check_target(&par, NFPROTO_ARP,
-	      t->u.target_size - sizeof(*t), 0, false);
+	ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
 	if (ret < 0) {
 		duprintf("arp_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
@@ -566,6 +567,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
 	t = arpt_get_target(e);
 	par.target   = t->u.kernel.target;
 	par.targinfo = t->data;
+	par.family   = NFPROTO_ARP;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 50b9a6c34c3..213fb27debc 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -348,6 +348,7 @@ ipt_do_table(struct sk_buff *skb,
 	mtpar.hotdrop = &hotdrop;
 	mtpar.in      = tgpar.in  = in;
 	mtpar.out     = tgpar.out = out;
+	mtpar.family  = tgpar.family = NFPROTO_IPV4;
 	tgpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
@@ -579,6 +580,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i)
 
 	par.match     = m->u.kernel.match;
 	par.matchinfo = m->data;
+	par.family    = NFPROTO_IPV4;
 	if (par.match->destroy != NULL)
 		par.match->destroy(&par);
 	module_put(par.match->me);
@@ -616,7 +618,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
 	par->match     = m->u.kernel.match;
 	par->matchinfo = m->data;
 
-	ret = xt_check_match(par, NFPROTO_IPV4, m->u.match_size - sizeof(*m),
+	ret = xt_check_match(par, m->u.match_size - sizeof(*m),
 	      ip->proto, ip->invflags & IPT_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -662,10 +664,11 @@ static int check_target(struct ipt_entry *e, const char *name)
 		.target    = t->u.kernel.target,
 		.targinfo  = t->data,
 		.hook_mask = e->comefrom,
+		.family    = NFPROTO_IPV4,
 	};
 	int ret;
 
-	ret = xt_check_target(&par, NFPROTO_IPV4, t->u.target_size - sizeof(*t),
+	ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
 	      e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -693,6 +696,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ip;
 	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV4;
 	ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
 	if (ret != 0)
 		goto cleanup_matches;
@@ -780,6 +784,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i)
 
 	par.target   = t->u.kernel.target;
 	par.targinfo = t->data;
+	par.family   = NFPROTO_IPV4;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
@@ -1659,6 +1664,7 @@ compat_check_entry(struct ipt_entry *e, const char *name,
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ip;
 	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV4;
 	ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j);
 	if (ret)
 		goto cleanup_matches;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d934a699463..a33485dc81c 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -370,6 +370,7 @@ ip6t_do_table(struct sk_buff *skb,
 	mtpar.hotdrop = &hotdrop;
 	mtpar.in      = tgpar.in  = in;
 	mtpar.out     = tgpar.out = out;
+	mtpar.family  = tgpar.family = NFPROTO_IPV6;
 	tgpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
@@ -604,6 +605,7 @@ cleanup_match(struct ip6t_entry_match *m, unsigned int *i)
 
 	par.match     = m->u.kernel.match;
 	par.matchinfo = m->data;
+	par.family    = NFPROTO_IPV6;
 	if (par.match->destroy != NULL)
 		par.match->destroy(&par);
 	module_put(par.match->me);
@@ -640,7 +642,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par,
 	par->match     = m->u.kernel.match;
 	par->matchinfo = m->data;
 
-	ret = xt_check_match(par, NFPROTO_IPV6, m->u.match_size - sizeof(*m),
+	ret = xt_check_match(par, m->u.match_size - sizeof(*m),
 			     ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -686,11 +688,12 @@ static int check_target(struct ip6t_entry *e, const char *name)
 		.target    = t->u.kernel.target,
 		.targinfo  = t->data,
 		.hook_mask = e->comefrom,
+		.family    = NFPROTO_IPV6,
 	};
 	int ret;
 
 	t = ip6t_get_target(e);
-	ret = xt_check_target(&par, NFPROTO_IPV6, t->u.target_size - sizeof(*t),
+	ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
 	      e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO);
 	if (ret < 0) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -718,6 +721,7 @@ find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ipv6;
 	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV6;
 	ret = IP6T_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
 	if (ret != 0)
 		goto cleanup_matches;
@@ -805,6 +809,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i)
 
 	par.target   = t->u.kernel.target;
 	par.targinfo = t->data;
+	par.family   = NFPROTO_IPV6;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
@@ -1685,6 +1690,7 @@ static int compat_check_entry(struct ip6t_entry *e, const char *name,
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ipv6;
 	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV6;
 	ret = IP6T_MATCH_ITERATE(e, check_match, &mtpar, &j);
 	if (ret)
 		goto cleanup_matches;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f29513cd139..89837a4eef7 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -321,7 +321,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 }
 EXPORT_SYMBOL_GPL(xt_find_revision);
 
-int xt_check_match(struct xt_mtchk_param *par, u_int8_t family,
+int xt_check_match(struct xt_mtchk_param *par,
 		   unsigned int size, u_int8_t proto, bool inv_proto)
 {
 	if (XT_ALIGN(par->match->matchsize) != size &&
@@ -331,26 +331,27 @@ int xt_check_match(struct xt_mtchk_param *par, u_int8_t family,
 		 * because it uses a dynamic-size data set.
 		 */
 		printk("%s_tables: %s match: invalid size %Zu != %u\n",
-		       xt_prefix[family], par->match->name,
+		       xt_prefix[par->family], par->match->name,
 		       XT_ALIGN(par->match->matchsize), size);
 		return -EINVAL;
 	}
 	if (par->match->table != NULL &&
 	    strcmp(par->match->table, par->table) != 0) {
 		printk("%s_tables: %s match: only valid in %s table, not %s\n",
-		       xt_prefix[family], par->match->name,
+		       xt_prefix[par->family], par->match->name,
 		       par->match->table, par->table);
 		return -EINVAL;
 	}
 	if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
 		printk("%s_tables: %s match: bad hook_mask %#x/%#x\n",
-		       xt_prefix[family], par->match->name,
+		       xt_prefix[par->family], par->match->name,
 		       par->hook_mask, par->match->hooks);
 		return -EINVAL;
 	}
 	if (par->match->proto && (par->match->proto != proto || inv_proto)) {
 		printk("%s_tables: %s match: only valid for protocol %u\n",
-		       xt_prefix[family], par->match->name, par->match->proto);
+		       xt_prefix[par->family], par->match->name,
+		       par->match->proto);
 		return -EINVAL;
 	}
 	if (par->match->checkentry != NULL && !par->match->checkentry(par))
@@ -471,31 +472,31 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr,
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 #endif /* CONFIG_COMPAT */
 
-int xt_check_target(struct xt_tgchk_param *par, u_int8_t family,
+int xt_check_target(struct xt_tgchk_param *par,
 		    unsigned int size, u_int8_t proto, bool inv_proto)
 {
 	if (XT_ALIGN(par->target->targetsize) != size) {
 		printk("%s_tables: %s target: invalid size %Zu != %u\n",
-		       xt_prefix[family], par->target->name,
+		       xt_prefix[par->family], par->target->name,
 		       XT_ALIGN(par->target->targetsize), size);
 		return -EINVAL;
 	}
 	if (par->target->table != NULL &&
 	    strcmp(par->target->table, par->table) != 0) {
 		printk("%s_tables: %s target: only valid in %s table, not %s\n",
-		       xt_prefix[family], par->target->name,
+		       xt_prefix[par->family], par->target->name,
 		       par->target->table, par->table);
 		return -EINVAL;
 	}
 	if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
 		printk("%s_tables: %s target: bad hook_mask %#x/%#x\n",
-		       xt_prefix[family], par->target->name, par->hook_mask,
-		       par->target->hooks);
+		       xt_prefix[par->family], par->target->name,
+		       par->hook_mask, par->target->hooks);
 		return -EINVAL;
 	}
 	if (par->target->proto && (par->target->proto != proto || inv_proto)) {
 		printk("%s_tables: %s target: only valid for protocol %u\n",
-		       xt_prefix[family], par->target->name,
+		       xt_prefix[par->family], par->target->name,
 		       par->target->proto);
 		return -EINVAL;
 	}
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index b951d422db9..0453d79ebf5 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -55,9 +55,9 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 	par.target    = target;
 	par.targinfo  = t->data;
 	par.hook_mask = hook;
+	par.family    = NFPROTO_IPV4;
 
-	ret = xt_check_target(&par, NFPROTO_IPV4,
-	      t->u.target_size - sizeof(*t), 0, false);
+	ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
 	if (ret < 0) {
 		module_put(t->u.kernel.target->me);
 		return ret;
-- 
cgit v1.2.3


From 92f3b2b1bc968caaabee8cd78bee75ab7c4af74e Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:20 +0200
Subject: netfilter: xtables: cut down on static data for family-independent
 extensions

Using ->family in struct xt_*_param, multiple struct xt_{match,target}
can be squashed together.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_CONNMARK.c    | 39 +++++++----------------------------
 net/netfilter/xt_CONNSECMARK.c | 40 +++++++++++++----------------------
 net/netfilter/xt_NFLOG.c       | 31 ++++++++++------------------
 net/netfilter/xt_connbytes.c   | 39 +++++++++++++----------------------
 net/netfilter/xt_connlimit.c   | 47 ++++++++++++++++--------------------------
 net/netfilter/xt_connmark.c    | 39 +++++++----------------------------
 net/netfilter/xt_conntrack.c   | 26 +++++++----------------
 net/netfilter/xt_helper.c      | 38 +++++++++++++---------------------
 net/netfilter/xt_pkttype.c     | 30 ++++++++++-----------------
 9 files changed, 104 insertions(+), 225 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index c5a5072e005..d6e5ab46327 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -128,9 +128,9 @@ static bool connmark_tg_check_v0(const struct xt_tgchk_param *par)
 		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
 		return false;
 	}
-	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", par->target->family);
+				    "proto=%u\n", par->family);
 		return false;
 	}
 	return true;
@@ -138,9 +138,9 @@ static bool connmark_tg_check_v0(const struct xt_tgchk_param *par)
 
 static bool connmark_tg_check(const struct xt_tgchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "cannot load conntrack support for "
-		       "proto=%u\n", par->target->family);
+		       "proto=%u\n", par->family);
 		return false;
 	}
 	return true;
@@ -148,7 +148,7 @@ static bool connmark_tg_check(const struct xt_tgchk_param *par)
 
 static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
-	nf_ct_l3proto_module_put(par->target->family);
+	nf_ct_l3proto_module_put(par->family);
 }
 
 #ifdef CONFIG_COMPAT
@@ -186,7 +186,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
 	{
 		.name		= "CONNMARK",
 		.revision	= 0,
-		.family		= NFPROTO_IPV4,
+		.family		= NFPROTO_UNSPEC,
 		.checkentry	= connmark_tg_check_v0,
 		.destroy	= connmark_tg_destroy,
 		.target		= connmark_tg_v0,
@@ -198,35 +198,10 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
 #endif
 		.me		= THIS_MODULE
 	},
-	{
-		.name		= "CONNMARK",
-		.revision	= 0,
-		.family		= NFPROTO_IPV6,
-		.checkentry	= connmark_tg_check_v0,
-		.destroy	= connmark_tg_destroy,
-		.target		= connmark_tg_v0,
-		.targetsize	= sizeof(struct xt_connmark_target_info),
-#ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_connmark_target_info),
-		.compat_from_user = connmark_tg_compat_from_user_v0,
-		.compat_to_user	= connmark_tg_compat_to_user_v0,
-#endif
-		.me		= THIS_MODULE
-	},
-	{
-		.name           = "CONNMARK",
-		.revision       = 1,
-		.family         = NFPROTO_IPV4,
-		.checkentry     = connmark_tg_check,
-		.target         = connmark_tg,
-		.targetsize     = sizeof(struct xt_connmark_tginfo1),
-		.destroy        = connmark_tg_destroy,
-		.me             = THIS_MODULE,
-	},
 	{
 		.name           = "CONNMARK",
 		.revision       = 1,
-		.family         = NFPROTO_IPV6,
+		.family         = NFPROTO_UNSPEC,
 		.checkentry     = connmark_tg_check,
 		.target         = connmark_tg,
 		.targetsize     = sizeof(struct xt_connmark_tginfo1),
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index b6e3f3f125f..b54c3756fdc 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -106,9 +106,9 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par)
 		return false;
 	}
 
-	if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", par->target->family);
+				    "proto=%u\n", par->family);
 		return false;
 	}
 	return true;
@@ -116,40 +116,28 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par)
 
 static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
-	nf_ct_l3proto_module_put(par->target->family);
+	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_target connsecmark_tg_reg[] __read_mostly = {
-	{
-		.name		= "CONNSECMARK",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= connsecmark_tg_check,
-		.destroy	= connsecmark_tg_destroy,
-		.target		= connsecmark_tg,
-		.targetsize	= sizeof(struct xt_connsecmark_target_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "CONNSECMARK",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= connsecmark_tg_check,
-		.destroy	= connsecmark_tg_destroy,
-		.target		= connsecmark_tg,
-		.targetsize	= sizeof(struct xt_connsecmark_target_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_target connsecmark_tg_reg __read_mostly = {
+	.name       = "CONNSECMARK",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = connsecmark_tg_check,
+	.destroy    = connsecmark_tg_destroy,
+	.target     = connsecmark_tg,
+	.targetsize = sizeof(struct xt_connsecmark_target_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init connsecmark_tg_init(void)
 {
-	return xt_register_targets(connsecmark_tg_reg,
-	       ARRAY_SIZE(connsecmark_tg_reg));
+	return xt_register_target(&connsecmark_tg_reg);
 }
 
 static void __exit connsecmark_tg_exit(void)
 {
-	xt_unregister_targets(connsecmark_tg_reg,
-	                      ARRAY_SIZE(connsecmark_tg_reg));
+	xt_unregister_target(&connsecmark_tg_reg);
 }
 
 module_init(connsecmark_tg_init);
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 56ee4f118b5..50e3a52d3b3 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -31,7 +31,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	li.u.ulog.group	     = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
-	nf_log_packet(par->target->family, par->hooknum, skb, par->in,
+	nf_log_packet(par->family, par->hooknum, skb, par->in,
 	              par->out, &li, "%s", info->prefix);
 	return XT_CONTINUE;
 }
@@ -47,33 +47,24 @@ static bool nflog_tg_check(const struct xt_tgchk_param *par)
 	return true;
 }
 
-static struct xt_target nflog_tg_reg[] __read_mostly = {
-	{
-		.name		= "NFLOG",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= nflog_tg_check,
-		.target		= nflog_tg,
-		.targetsize	= sizeof(struct xt_nflog_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "NFLOG",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= nflog_tg_check,
-		.target		= nflog_tg,
-		.targetsize	= sizeof(struct xt_nflog_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_target nflog_tg_reg __read_mostly = {
+	.name       = "NFLOG",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = nflog_tg_check,
+	.target     = nflog_tg,
+	.targetsize = sizeof(struct xt_nflog_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init nflog_tg_init(void)
 {
-	return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
+	return xt_register_target(&nflog_tg_reg);
 }
 
 static void __exit nflog_tg_exit(void)
 {
-	xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
+	xt_unregister_target(&nflog_tg_reg);
 }
 
 module_init(nflog_tg_init);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 5bf4aa08b0f..955e6598a7f 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -106,9 +106,9 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par)
 	    sinfo->direction != XT_CONNBYTES_DIR_BOTH)
 		return false;
 
-	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", par->match->family);
+				    "proto=%u\n", par->family);
 		return false;
 	}
 
@@ -117,39 +117,28 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par)
 
 static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(par->match->family);
+	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_match connbytes_mt_reg[] __read_mostly = {
-	{
-		.name		= "connbytes",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= connbytes_mt_check,
-		.match		= connbytes_mt,
-		.destroy	= connbytes_mt_destroy,
-		.matchsize	= sizeof(struct xt_connbytes_info),
-		.me		= THIS_MODULE
-	},
-	{
-		.name		= "connbytes",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= connbytes_mt_check,
-		.match		= connbytes_mt,
-		.destroy	= connbytes_mt_destroy,
-		.matchsize	= sizeof(struct xt_connbytes_info),
-		.me		= THIS_MODULE
-	},
+static struct xt_match connbytes_mt_reg __read_mostly = {
+	.name       = "connbytes",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = connbytes_mt_check,
+	.match      = connbytes_mt,
+	.destroy    = connbytes_mt_destroy,
+	.matchsize  = sizeof(struct xt_connbytes_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init connbytes_mt_init(void)
 {
-	return xt_register_matches(connbytes_mt_reg,
-	       ARRAY_SIZE(connbytes_mt_reg));
+	return xt_register_match(&connbytes_mt_reg);
 }
 
 static void __exit connbytes_mt_exit(void)
 {
-	xt_unregister_matches(connbytes_mt_reg, ARRAY_SIZE(connbytes_mt_reg));
+	xt_unregister_match(&connbytes_mt_reg);
 }
 
 module_init(connbytes_mt_init);
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index bfb3ee6c712..7f404cc64c8 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -192,10 +192,10 @@ connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (ct != NULL)
 		tuple_ptr = &ct->tuplehash[0].tuple;
 	else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
-				    par->match->family, &tuple))
+				    par->family, &tuple))
 		goto hotdrop;
 
-	if (par->match->family == NFPROTO_IPV6) {
+	if (par->family == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
 		memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
 	} else {
@@ -226,16 +226,16 @@ static bool connlimit_mt_check(const struct xt_mtchk_param *par)
 	struct xt_connlimit_info *info = par->matchinfo;
 	unsigned int i;
 
-	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "cannot load conntrack support for "
-		       "address family %u\n", par->match->family);
+		       "address family %u\n", par->family);
 		return false;
 	}
 
 	/* init private data */
 	info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
 	if (info->data == NULL) {
-		nf_ct_l3proto_module_put(par->match->family);
+		nf_ct_l3proto_module_put(par->family);
 		return false;
 	}
 
@@ -254,7 +254,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 	struct list_head *hash = info->data->iphash;
 	unsigned int i;
 
-	nf_ct_l3proto_module_put(par->match->family);
+	nf_ct_l3proto_module_put(par->family);
 
 	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
 		list_for_each_entry_safe(conn, tmp, &hash[i], list) {
@@ -266,41 +266,30 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 	kfree(info->data);
 }
 
-static struct xt_match connlimit_mt_reg[] __read_mostly = {
-	{
-		.name       = "connlimit",
-		.family     = NFPROTO_IPV4,
-		.checkentry = connlimit_mt_check,
-		.match      = connlimit_mt,
-		.matchsize  = sizeof(struct xt_connlimit_info),
-		.destroy    = connlimit_mt_destroy,
-		.me         = THIS_MODULE,
-	},
-	{
-		.name       = "connlimit",
-		.family     = NFPROTO_IPV6,
-		.checkentry = connlimit_mt_check,
-		.match      = connlimit_mt,
-		.matchsize  = sizeof(struct xt_connlimit_info),
-		.destroy    = connlimit_mt_destroy,
-		.me         = THIS_MODULE,
-	},
+static struct xt_match connlimit_mt_reg __read_mostly = {
+	.name       = "connlimit",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = connlimit_mt_check,
+	.match      = connlimit_mt,
+	.matchsize  = sizeof(struct xt_connlimit_info),
+	.destroy    = connlimit_mt_destroy,
+	.me         = THIS_MODULE,
 };
 
 static int __init connlimit_mt_init(void)
 {
-	return xt_register_matches(connlimit_mt_reg,
-	       ARRAY_SIZE(connlimit_mt_reg));
+	return xt_register_match(&connlimit_mt_reg);
 }
 
 static void __exit connlimit_mt_exit(void)
 {
-	xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
+	xt_unregister_match(&connlimit_mt_reg);
 }
 
 module_init(connlimit_mt_init);
 module_exit(connlimit_mt_exit);
-MODULE_AUTHOR("Jan Engelhardt <jengelh@computergmbh.de>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
 MODULE_DESCRIPTION("Xtables: Number of connections matching");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_connlimit");
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index c708577ea1b..86cacab7a4a 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -69,9 +69,9 @@ static bool connmark_mt_check_v0(const struct xt_mtchk_param *par)
 		printk(KERN_WARNING "connmark: only support 32bit mark\n");
 		return false;
 	}
-	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", par->match->family);
+				    "proto=%u\n", par->family);
 		return false;
 	}
 	return true;
@@ -79,9 +79,9 @@ static bool connmark_mt_check_v0(const struct xt_mtchk_param *par)
 
 static bool connmark_mt_check(const struct xt_mtchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "cannot load conntrack support for "
-		       "proto=%u\n", par->match->family);
+		       "proto=%u\n", par->family);
 		return false;
 	}
 	return true;
@@ -89,7 +89,7 @@ static bool connmark_mt_check(const struct xt_mtchk_param *par)
 
 static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(par->match->family);
+	nf_ct_l3proto_module_put(par->family);
 }
 
 #ifdef CONFIG_COMPAT
@@ -127,7 +127,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = {
 	{
 		.name		= "connmark",
 		.revision	= 0,
-		.family		= NFPROTO_IPV4,
+		.family		= NFPROTO_UNSPEC,
 		.checkentry	= connmark_mt_check_v0,
 		.match		= connmark_mt_v0,
 		.destroy	= connmark_mt_destroy,
@@ -139,35 +139,10 @@ static struct xt_match connmark_mt_reg[] __read_mostly = {
 #endif
 		.me		= THIS_MODULE
 	},
-	{
-		.name		= "connmark",
-		.revision	= 0,
-		.family		= NFPROTO_IPV6,
-		.checkentry	= connmark_mt_check_v0,
-		.match		= connmark_mt_v0,
-		.destroy	= connmark_mt_destroy,
-		.matchsize	= sizeof(struct xt_connmark_info),
-#ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_connmark_info),
-		.compat_from_user = connmark_mt_compat_from_user_v0,
-		.compat_to_user	= connmark_mt_compat_to_user_v0,
-#endif
-		.me		= THIS_MODULE
-	},
-	{
-		.name           = "connmark",
-		.revision       = 1,
-		.family         = NFPROTO_IPV4,
-		.checkentry     = connmark_mt_check,
-		.match          = connmark_mt,
-		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
-		.destroy        = connmark_mt_destroy,
-		.me             = THIS_MODULE,
-	},
 	{
 		.name           = "connmark",
 		.revision       = 1,
-		.family         = NFPROTO_IPV6,
+		.family         = NFPROTO_UNSPEC,
 		.checkentry     = connmark_mt_check,
 		.match          = connmark_mt,
 		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 5cd58d7fcb1..0b7139f3dd7 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -238,22 +238,22 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 		return false;
 
 	if (info->match_flags & XT_CONNTRACK_ORIGSRC)
-		if (conntrack_mt_origsrc(ct, info, par->match->family) ^
+		if (conntrack_mt_origsrc(ct, info, par->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_ORIGSRC))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_ORIGDST)
-		if (conntrack_mt_origdst(ct, info, par->match->family) ^
+		if (conntrack_mt_origdst(ct, info, par->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_ORIGDST))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_REPLSRC)
-		if (conntrack_mt_replsrc(ct, info, par->match->family) ^
+		if (conntrack_mt_replsrc(ct, info, par->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_REPLSRC))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_REPLDST)
-		if (conntrack_mt_repldst(ct, info, par->match->family) ^
+		if (conntrack_mt_repldst(ct, info, par->family) ^
 		    !(info->invert_flags & XT_CONNTRACK_REPLDST))
 			return false;
 
@@ -280,9 +280,9 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 
 static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 {
-	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", par->match->family);
+				    "proto=%u\n", par->family);
 		return false;
 	}
 	return true;
@@ -290,7 +290,7 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 
 static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(par->match->family);
+	nf_ct_l3proto_module_put(par->family);
 }
 
 #ifdef CONFIG_COMPAT
@@ -361,17 +361,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
 		.revision   = 1,
-		.family     = NFPROTO_IPV4,
-		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
-		.match      = conntrack_mt,
-		.checkentry = conntrack_mt_check,
-		.destroy    = conntrack_mt_destroy,
-		.me         = THIS_MODULE,
-	},
-	{
-		.name       = "conntrack",
-		.revision   = 1,
-		.family     = NFPROTO_IPV6,
+		.family     = NFPROTO_UNSPEC,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
 		.match      = conntrack_mt,
 		.checkentry = conntrack_mt_check,
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 280c984349f..64fc7f27722 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -58,9 +58,9 @@ static bool helper_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_helper_info *info = par->matchinfo;
 
-	if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
-				    "proto=%u\n", par->match->family);
+				    "proto=%u\n", par->family);
 		return false;
 	}
 	info->name[29] = '\0';
@@ -69,38 +69,28 @@ static bool helper_mt_check(const struct xt_mtchk_param *par)
 
 static void helper_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	nf_ct_l3proto_module_put(par->match->family);
+	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_match helper_mt_reg[] __read_mostly = {
-	{
-		.name		= "helper",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= helper_mt_check,
-		.match		= helper_mt,
-		.destroy	= helper_mt_destroy,
-		.matchsize	= sizeof(struct xt_helper_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "helper",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= helper_mt_check,
-		.match		= helper_mt,
-		.destroy	= helper_mt_destroy,
-		.matchsize	= sizeof(struct xt_helper_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_match helper_mt_reg __read_mostly = {
+	.name       = "helper",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = helper_mt_check,
+	.match      = helper_mt,
+	.destroy    = helper_mt_destroy,
+	.matchsize  = sizeof(struct xt_helper_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init helper_mt_init(void)
 {
-	return xt_register_matches(helper_mt_reg, ARRAY_SIZE(helper_mt_reg));
+	return xt_register_match(&helper_mt_reg);
 }
 
 static void __exit helper_mt_exit(void)
 {
-	xt_unregister_matches(helper_mt_reg, ARRAY_SIZE(helper_mt_reg));
+	xt_unregister_match(&helper_mt_reg);
 }
 
 module_init(helper_mt_init);
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 37753a37760..69da1d3a1d8 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -30,10 +30,10 @@ pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 
 	if (skb->pkt_type != PACKET_LOOPBACK)
 		type = skb->pkt_type;
-	else if (par->match->family == NFPROTO_IPV4 &&
+	else if (par->family == NFPROTO_IPV4 &&
 	    ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (par->match->family == NFPROTO_IPV6 &&
+	else if (par->family == NFPROTO_IPV6 &&
 	    ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
 		type = PACKET_MULTICAST;
 	else
@@ -42,31 +42,23 @@ pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return (type == info->pkttype) ^ info->invert;
 }
 
-static struct xt_match pkttype_mt_reg[] __read_mostly = {
-	{
-		.name		= "pkttype",
-		.family		= NFPROTO_IPV4,
-		.match		= pkttype_mt,
-		.matchsize	= sizeof(struct xt_pkttype_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "pkttype",
-		.family		= NFPROTO_IPV6,
-		.match		= pkttype_mt,
-		.matchsize	= sizeof(struct xt_pkttype_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_match pkttype_mt_reg __read_mostly = {
+	.name      = "pkttype",
+	.revision  = 0,
+	.family    = NFPROTO_UNSPEC,
+	.match     = pkttype_mt,
+	.matchsize = sizeof(struct xt_pkttype_info),
+	.me        = THIS_MODULE,
 };
 
 static int __init pkttype_mt_init(void)
 {
-	return xt_register_matches(pkttype_mt_reg, ARRAY_SIZE(pkttype_mt_reg));
+	return xt_register_match(&pkttype_mt_reg);
 }
 
 static void __exit pkttype_mt_exit(void)
 {
-	xt_unregister_matches(pkttype_mt_reg, ARRAY_SIZE(pkttype_mt_reg));
+	xt_unregister_match(&pkttype_mt_reg);
 }
 
 module_init(pkttype_mt_init);
-- 
cgit v1.2.3


From ab4f21e6fb1c09b13c4c3cb8357babe8223471bd Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:20 +0200
Subject: netfilter: xtables: use NFPROTO_UNSPEC in more extensions

Lots of extensions are completely family-independent, so squash some code.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_MARK.c    | 34 ++--------------------------------
 net/netfilter/xt_NOTRACK.c | 26 +++++++++-----------------
 net/netfilter/xt_comment.c | 26 +++++++++-----------------
 net/netfilter/xt_mac.c     | 34 +++++++++++-----------------------
 net/netfilter/xt_owner.c   | 12 +-----------
 net/netfilter/xt_physdev.c | 29 ++++++++++-------------------
 net/netfilter/xt_realm.c   |  2 +-
 7 files changed, 43 insertions(+), 120 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 123ee0ba78c..67574bcfb8a 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -149,7 +149,7 @@ static int mark_tg_compat_to_user_v1(void __user *dst, void *src)
 static struct xt_target mark_tg_reg[] __read_mostly = {
 	{
 		.name		= "MARK",
-		.family		= NFPROTO_IPV4,
+		.family		= NFPROTO_UNSPEC,
 		.revision	= 0,
 		.checkentry	= mark_tg_check_v0,
 		.target		= mark_tg_v0,
@@ -164,37 +164,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "MARK",
-		.family		= NFPROTO_IPV4,
-		.revision	= 1,
-		.checkentry	= mark_tg_check_v1,
-		.target		= mark_tg_v1,
-		.targetsize	= sizeof(struct xt_mark_target_info_v1),
-#ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_mark_target_info_v1),
-		.compat_from_user = mark_tg_compat_from_user_v1,
-		.compat_to_user	= mark_tg_compat_to_user_v1,
-#endif
-		.table		= "mangle",
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "MARK",
-		.family		= NFPROTO_IPV6,
-		.revision	= 0,
-		.checkentry	= mark_tg_check_v0,
-		.target		= mark_tg_v0,
-		.targetsize	= sizeof(struct xt_mark_target_info),
-#ifdef CONFIG_COMPAT
-		.compatsize	= sizeof(struct compat_xt_mark_target_info),
-		.compat_from_user = mark_tg_compat_from_user_v0,
-		.compat_to_user	= mark_tg_compat_to_user_v0,
-#endif
-		.table		= "mangle",
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "MARK",
-		.family		= NFPROTO_IPV6,
+		.family		= NFPROTO_UNSPEC,
 		.revision	= 1,
 		.checkentry	= mark_tg_check_v1,
 		.target		= mark_tg_v1,
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index cc50295cd11..e7a0a54fd4e 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -30,31 +30,23 @@ notrack_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	return XT_CONTINUE;
 }
 
-static struct xt_target notrack_tg_reg[] __read_mostly = {
-	{
-		.name		= "NOTRACK",
-		.family		= NFPROTO_IPV4,
-		.target		= notrack_tg,
-		.table		= "raw",
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "NOTRACK",
-		.family		= NFPROTO_IPV6,
-		.target		= notrack_tg,
-		.table		= "raw",
-		.me		= THIS_MODULE,
-	},
+static struct xt_target notrack_tg_reg __read_mostly = {
+	.name     = "NOTRACK",
+	.revision = 0,
+	.family   = NFPROTO_UNSPEC,
+	.target   = notrack_tg,
+	.table    = "raw",
+	.me       = THIS_MODULE,
 };
 
 static int __init notrack_tg_init(void)
 {
-	return xt_register_targets(notrack_tg_reg, ARRAY_SIZE(notrack_tg_reg));
+	return xt_register_target(&notrack_tg_reg);
 }
 
 static void __exit notrack_tg_exit(void)
 {
-	xt_unregister_targets(notrack_tg_reg, ARRAY_SIZE(notrack_tg_reg));
+	xt_unregister_target(&notrack_tg_reg);
 }
 
 module_init(notrack_tg_init);
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index bd7aa57af42..e82179832ac 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -22,31 +22,23 @@ comment_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
-static struct xt_match comment_mt_reg[] __read_mostly = {
-	{
-		.name		= "comment",
-		.family		= NFPROTO_IPV4,
-		.match		= comment_mt,
-		.matchsize	= sizeof(struct xt_comment_info),
-		.me		= THIS_MODULE
-	},
-	{
-		.name		= "comment",
-		.family		= NFPROTO_IPV6,
-		.match		= comment_mt,
-		.matchsize	= sizeof(struct xt_comment_info),
-		.me		= THIS_MODULE
-	},
+static struct xt_match comment_mt_reg __read_mostly = {
+	.name      = "comment",
+	.revision  = 0,
+	.family    = NFPROTO_UNSPEC,
+	.match     = comment_mt,
+	.matchsize = sizeof(struct xt_comment_info),
+	.me        = THIS_MODULE,
 };
 
 static int __init comment_mt_init(void)
 {
-	return xt_register_matches(comment_mt_reg, ARRAY_SIZE(comment_mt_reg));
+	return xt_register_match(&comment_mt_reg);
 }
 
 static void __exit comment_mt_exit(void)
 {
-	xt_unregister_matches(comment_mt_reg, ARRAY_SIZE(comment_mt_reg));
+	xt_unregister_match(&comment_mt_reg);
 }
 
 module_init(comment_mt_init);
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 269f9d8aef5..c2007116ce5 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -36,37 +36,25 @@ static bool mac_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 		^ info->invert);
 }
 
-static struct xt_match mac_mt_reg[] __read_mostly = {
-	{
-		.name		= "mac",
-		.family		= NFPROTO_IPV4,
-		.match		= mac_mt,
-		.matchsize	= sizeof(struct xt_mac_info),
-		.hooks		= (1 << NF_INET_PRE_ROUTING) |
-				  (1 << NF_INET_LOCAL_IN) |
-				  (1 << NF_INET_FORWARD),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "mac",
-		.family		= NFPROTO_IPV6,
-		.match		= mac_mt,
-		.matchsize	= sizeof(struct xt_mac_info),
-		.hooks		= (1 << NF_INET_PRE_ROUTING) |
-				  (1 << NF_INET_LOCAL_IN) |
-				  (1 << NF_INET_FORWARD),
-		.me		= THIS_MODULE,
-	},
+static struct xt_match mac_mt_reg __read_mostly = {
+	.name      = "mac",
+	.revision  = 0,
+	.family    = NFPROTO_UNSPEC,
+	.match     = mac_mt,
+	.matchsize = sizeof(struct xt_mac_info),
+	.hooks     = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
+	             (1 << NF_INET_FORWARD),
+	.me        = THIS_MODULE,
 };
 
 static int __init mac_mt_init(void)
 {
-	return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
+	return xt_register_match(&mac_mt_reg);
 }
 
 static void __exit mac_mt_exit(void)
 {
-	xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
+	xt_unregister_match(&mac_mt_reg);
 }
 
 module_init(mac_mt_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 32f84e84d9e..f19ebd9b78f 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -160,17 +160,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = {
 	{
 		.name       = "owner",
 		.revision   = 1,
-		.family     = NFPROTO_IPV4,
-		.match      = owner_mt,
-		.matchsize  = sizeof(struct xt_owner_match_info),
-		.hooks      = (1 << NF_INET_LOCAL_OUT) |
-		              (1 << NF_INET_POST_ROUTING),
-		.me         = THIS_MODULE,
-	},
-	{
-		.name       = "owner",
-		.revision   = 1,
-		.family     = NFPROTO_IPV6,
+		.family     = NFPROTO_UNSPEC,
 		.match      = owner_mt,
 		.matchsize  = sizeof(struct xt_owner_match_info),
 		.hooks      = (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index b01786d2dd9..1bcdfc12cf5 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -112,33 +112,24 @@ static bool physdev_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static struct xt_match physdev_mt_reg[] __read_mostly = {
-	{
-		.name		= "physdev",
-		.family		= NFPROTO_IPV4,
-		.checkentry	= physdev_mt_check,
-		.match		= physdev_mt,
-		.matchsize	= sizeof(struct xt_physdev_info),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "physdev",
-		.family		= NFPROTO_IPV6,
-		.checkentry	= physdev_mt_check,
-		.match		= physdev_mt,
-		.matchsize	= sizeof(struct xt_physdev_info),
-		.me		= THIS_MODULE,
-	},
+static struct xt_match physdev_mt_reg __read_mostly = {
+	.name       = "physdev",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = physdev_mt_check,
+	.match      = physdev_mt,
+	.matchsize  = sizeof(struct xt_physdev_info),
+	.me         = THIS_MODULE,
 };
 
 static int __init physdev_mt_init(void)
 {
-	return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
+	return xt_register_match(&physdev_mt_reg);
 }
 
 static void __exit physdev_mt_exit(void)
 {
-	xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
+	xt_unregister_match(&physdev_mt_reg);
 }
 
 module_init(physdev_mt_init);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index b25942110ed..67419287bc7 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -36,7 +36,7 @@ static struct xt_match realm_mt_reg __read_mostly = {
 	.matchsize	= sizeof(struct xt_realm_info),
 	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
-	.family		= NFPROTO_IPV4,
+	.family		= NFPROTO_UNSPEC,
 	.me		= THIS_MODULE
 };
 
-- 
cgit v1.2.3


From f39a9410ed0503278fd5edc559fa019051413039 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 8 Oct 2008 11:35:20 +0200
Subject: netfilter: xtables: remove bogus mangle table dependency of connmark

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 899e78051d8..f70b4145ffc 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -318,7 +318,6 @@ config NETFILTER_XT_TARGET_CLASSIFY
 
 config NETFILTER_XT_TARGET_CONNMARK
 	tristate  '"CONNMARK" target support'
-	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_CONNTRACK_MARK
-- 
cgit v1.2.3


From 0b0588d42b2774734b51525fe6550d77f8ea9bc0 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:31:18 -0700
Subject: ipv6: local dev is actually unused in ip6_fragment

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3df2c442d90..f22393e1166 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -613,7 +613,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 
 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
-	struct net_device *dev;
 	struct sk_buff *frag;
 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
@@ -624,7 +623,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	int ptr, offset = 0, err=0;
 	u8 *prevhdr, nexthdr = 0;
 
-	dev = rt->u.dst.dev;
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-- 
cgit v1.2.3


From 98b3377ca77a06a7bd75a444e9f7136e9bb5112e Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:31:44 -0700
Subject: ipv6: consolidate error paths in ipv6_frag_rcv

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2eeadfa039c..f4f62f08609 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -578,19 +578,12 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS);
 
 	/* Jumbo payload inhibits frag. header */
-	if (hdr->payload_len==0) {
-		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
-		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
-				  skb_network_header_len(skb));
-		return -1;
-	}
+	if (hdr->payload_len==0)
+		goto fail_hdr;
+
 	if (!pskb_may_pull(skb, (skb_transport_offset(skb) +
-				 sizeof(struct frag_hdr)))) {
-		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
-		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
-				  skb_network_header_len(skb));
-		return -1;
-	}
+				 sizeof(struct frag_hdr))))
+		goto fail_hdr;
 
 	hdr = ipv6_hdr(skb);
 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
@@ -624,6 +617,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS);
 	kfree_skb(skb);
 	return -1;
+
+fail_hdr:
+	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
+	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
+	return -1;
 }
 
 static struct inet6_protocol frag_protocol =
-- 
cgit v1.2.3


From 3bd653c8455bc7991bae77968702b31c8f5df883 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:54:51 -0700
Subject: netns: add net parameter to IP6_INC_STATS

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c       |  2 +-
 net/ipv6/ip6_output.c | 56 +++++++++++++++++++++++++++++++--------------------
 net/ipv6/mcast.c      | 10 ++++-----
 net/ipv6/ndisc.c      |  4 ++--
 net/ipv6/netfilter.c  |  3 ++-
 net/ipv6/raw.c        |  4 ++--
 net/ipv6/reassembly.c |  8 ++++----
 net/ipv6/route.c      |  7 +++++--
 8 files changed, 55 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index b3157a0cc15..758cdd7dc27 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -183,7 +183,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
 	 */
 	dst = ip6_route_output(net, sk, fl);
 	if (dst->error) {
-		IP6_INC_STATS(ip6_dst_idev(dst),
+		IP6_INC_STATS(net, ip6_dst_idev(dst),
 			      IPSTATS_MIB_OUTNOROUTES);
 	} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
 		res = 1;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f22393e1166..db28c208f32 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -150,13 +150,14 @@ static int ip6_output2(struct sk_buff *skb)
 					ip6_dev_loopback_xmit);
 
 			if (ipv6_hdr(skb)->hop_limit == 0) {
-				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
+				IP6_INC_STATS(dev_net(dev), idev,
+					      IPSTATS_MIB_OUTDISCARDS);
 				kfree_skb(skb);
 				return 0;
 			}
 		}
 
-		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
+		IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
 	}
 
 	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
@@ -175,7 +176,8 @@ int ip6_output(struct sk_buff *skb)
 {
 	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 	if (unlikely(idev->cnf.disable_ipv6)) {
-		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
+		IP6_INC_STATS(dev_net(skb->dst->dev), idev,
+			      IPSTATS_MIB_OUTDISCARDS);
 		kfree_skb(skb);
 		return 0;
 	}
@@ -194,6 +196,7 @@ int ip6_output(struct sk_buff *skb)
 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 	     struct ipv6_txoptions *opt, int ipfragok)
 {
+	struct net *net = sock_net(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct in6_addr *first_hop = &fl->fl6_dst;
 	struct dst_entry *dst = skb->dst;
@@ -216,7 +219,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 		if (skb_headroom(skb) < head_room) {
 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 			if (skb2 == NULL) {
-				IP6_INC_STATS(ip6_dst_idev(skb->dst),
+				IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 					      IPSTATS_MIB_OUTDISCARDS);
 				kfree_skb(skb);
 				return -ENOBUFS;
@@ -270,7 +273,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 
 	mtu = dst_mtu(dst);
 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
-		IP6_INC_STATS(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 			      IPSTATS_MIB_OUTREQUESTS);
 		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 				dst_output);
@@ -280,7 +283,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 	skb->dev = dst->dev;
 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
+	IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 	kfree_skb(skb);
 	return -EMSGSIZE;
 }
@@ -422,7 +425,7 @@ int ip6_forward(struct sk_buff *skb)
 		goto drop;
 
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
-		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 		goto drop;
 	}
 
@@ -468,13 +471,14 @@ int ip6_forward(struct sk_buff *skb)
 		if (proxied > 0)
 			return ip6_input(skb);
 		else if (proxied < 0) {
-			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+			IP6_INC_STATS(net, ip6_dst_idev(dst),
+				      IPSTATS_MIB_INDISCARDS);
 			goto drop;
 		}
 	}
 
 	if (!xfrm6_route_forward(skb)) {
-		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 		goto drop;
 	}
 	dst = skb->dst;
@@ -530,7 +534,7 @@ int ip6_forward(struct sk_buff *skb)
 	}
 
 	if (skb_cow(skb, dst->dev->hard_header_len)) {
-		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 		goto drop;
 	}
 
@@ -622,6 +626,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	__be32 frag_id = 0;
 	int ptr, offset = 0, err=0;
 	u8 *prevhdr, nexthdr = 0;
+	struct net *net = dev_net(skb->dst->dev);
 
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
@@ -635,7 +640,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	if (!skb->local_df) {
 		skb->dev = skb->dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
+			      IPSTATS_MIB_FRAGFAILS);
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -684,7 +690,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		*prevhdr = NEXTHDR_FRAGMENT;
 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 		if (!tmp_hdr) {
-			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
+			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
+				      IPSTATS_MIB_FRAGFAILS);
 			return -ENOMEM;
 		}
 
@@ -735,7 +742,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			err = output(skb);
 			if(!err)
-				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
+				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
+					      IPSTATS_MIB_FRAGCREATES);
 
 			if (err || !frag)
 				break;
@@ -748,7 +756,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		kfree(tmp_hdr);
 
 		if (err == 0) {
-			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
+			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
+				      IPSTATS_MIB_FRAGOKS);
 			dst_release(&rt->u.dst);
 			return 0;
 		}
@@ -759,7 +768,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 			frag = skb;
 		}
 
-		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
+			      IPSTATS_MIB_FRAGFAILS);
 		dst_release(&rt->u.dst);
 		return err;
 	}
@@ -793,7 +803,7 @@ slow_path:
 
 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
-			IP6_INC_STATS(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 				      IPSTATS_MIB_FRAGFAILS);
 			err = -ENOMEM;
 			goto fail;
@@ -857,15 +867,16 @@ slow_path:
 		if (err)
 			goto fail;
 
-		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
+		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
+			      IPSTATS_MIB_FRAGCREATES);
 	}
-	IP6_INC_STATS(ip6_dst_idev(skb->dst),
+	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 		      IPSTATS_MIB_FRAGOKS);
 	kfree_skb(skb);
 	return err;
 
 fail:
-	IP6_INC_STATS(ip6_dst_idev(skb->dst),
+	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 		      IPSTATS_MIB_FRAGFAILS);
 	kfree_skb(skb);
 	return err;
@@ -1385,7 +1396,7 @@ alloc_new_skb:
 	return 0;
 error:
 	inet->cork.length -= length;
-	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 	return err;
 }
 
@@ -1409,6 +1420,7 @@ int ip6_push_pending_frames(struct sock *sk)
 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
 	struct ipv6hdr *hdr;
 	struct ipv6_txoptions *opt = np->cork.opt;
 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
@@ -1462,7 +1474,7 @@ int ip6_push_pending_frames(struct sock *sk)
 	skb->mark = sk->sk_mark;
 
 	skb->dst = dst_clone(&rt->u.dst);
-	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
+	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	if (proto == IPPROTO_ICMPV6) {
 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 
@@ -1491,7 +1503,7 @@ void ip6_flush_pending_frames(struct sock *sk)
 
 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
 		if (skb->dst)
-			IP6_INC_STATS(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
 				      IPSTATS_MIB_OUTDISCARDS);
 		kfree_skb(skb);
 	}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index e7c03bcc278..a1d588d0b5d 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1446,7 +1446,7 @@ static void mld_sendpack(struct sk_buff *skb)
 	int err;
 	struct flowi fl;
 
-	IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 	payload_len = (skb->tail - skb->network_header) - sizeof(*pip6);
 	mldlen = skb->tail - skb->transport_header;
 	pip6->payload_len = htons(payload_len);
@@ -1771,7 +1771,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	struct flowi fl;
 
 	rcu_read_lock();
-	IP6_INC_STATS(__in6_dev_get(dev),
+	IP6_INC_STATS(net, __in6_dev_get(dev),
 		      IPSTATS_MIB_OUTREQUESTS);
 	rcu_read_unlock();
 	if (type == ICMPV6_MGM_REDUCTION)
@@ -1787,7 +1787,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 
 	if (skb == NULL) {
 		rcu_read_lock();
-		IP6_INC_STATS(__in6_dev_get(dev),
+		IP6_INC_STATS(net, __in6_dev_get(dev),
 			      IPSTATS_MIB_OUTDISCARDS);
 		rcu_read_unlock();
 		return;
@@ -1841,9 +1841,9 @@ out:
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS(idev, type);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
-		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
-		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 
 	if (likely(idev != NULL))
 		in6_dev_put(idev);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f1c62ba0f56..ce5b61763fd 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -516,7 +516,7 @@ static void __ndisc_send(struct net_device *dev,
 	skb->dst = dst;
 
 	idev = in6_dev_get(dst->dev);
-	IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 
 	err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 		      dst_output);
@@ -1581,7 +1581,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
 
 	buff->dst = dst;
 	idev = in6_dev_get(dst->dev);
-	IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev,
 		      dst_output);
 	if (!err) {
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 4cb4844a322..6b29b03925f 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -33,7 +33,8 @@ int ip6_route_me_harder(struct sk_buff *skb)
 #endif
 
 	if (dst->error) {
-		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+		IP6_INC_STATS(&init_net, ip6_dst_idev(dst),
+			      IPSTATS_MIB_OUTNOROUTES);
 		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
 		dst_release(dst);
 		return -EINVAL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e53e493606c..2ba04d41dc2 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -638,7 +638,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	if (err)
 		goto error_fault;
 
-	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
+	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 		      dst_output);
 	if (err > 0)
@@ -652,7 +652,7 @@ error_fault:
 	err = -EFAULT;
 	kfree_skb(skb);
 error:
-	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 	return err;
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index f4f62f08609..63644075f2d 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -433,7 +433,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -1;
 
 err:
-	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS);
+	IP6_INC_STATS(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst),
+		      IPSTATS_MIB_REASMFAILS);
 	kfree_skb(skb);
 	return -1;
 }
@@ -573,7 +574,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	struct frag_hdr *fhdr;
 	struct frag_queue *fq;
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
-	struct net *net;
+	struct net *net = dev_net(skb->dst->dev);
 
 	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS);
 
@@ -597,7 +598,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	net = dev_net(skb->dev);
 	if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh)
 		ip6_evictor(net, ip6_dst_idev(skb->dst));
 
@@ -619,7 +619,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	return -1;
 
 fail_hdr:
-	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
+	IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
 	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
 	return -1;
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e10a1701550..89dc6992434 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1833,16 +1833,19 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
 {
 	int type;
+	struct dst_entry *dst = skb->dst;
 	switch (ipstats_mib_noroutes) {
 	case IPSTATS_MIB_INNOROUTES:
 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
-			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
+			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+				      IPSTATS_MIB_INADDRERRORS);
 			break;
 		}
 		/* FALLTHROUGH */
 	case IPSTATS_MIB_OUTNOROUTES:
-		IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
+		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+			      ipstats_mib_noroutes);
 		break;
 	}
 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
-- 
cgit v1.2.3


From 483a47d2fe794328d29950fe00ce26dd405d9437 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 11:09:27 -0700
Subject: ipv6: added net argument to IP6_INC_STATS_BH

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/exthdrs.c    | 42 ++++++++++++++++++++++++------------------
 net/ipv6/ip6_input.c  | 27 ++++++++++++++++-----------
 net/ipv6/ip6_output.c | 18 +++++++++++-------
 net/ipv6/ip6mr.c      |  3 ++-
 net/ipv6/mcast.c      |  4 ++--
 net/ipv6/reassembly.c | 26 +++++++++++++++-----------
 6 files changed, 70 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 837c830d6d8..6bfffec2371 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -277,7 +277,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
 	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
 				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS_BH(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst),
 				 IPSTATS_MIB_INHDRERRORS);
 		kfree_skb(skb);
 		return -1;
@@ -301,7 +301,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+	IP6_INC_STATS_BH(dev_net(dst->dev),
+			 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 	dst_release(dst);
 	return -1;
 }
@@ -319,7 +320,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 	int n, i;
 	struct ipv6_rt_hdr *hdr;
 	struct rt0_hdr *rthdr;
-	int accept_source_route = dev_net(skb->dev)->ipv6.devconf_all->accept_source_route;
+	struct net *net = dev_net(skb->dev);
+	int accept_source_route = net->ipv6.devconf_all->accept_source_route;
 
 	idev = in6_dev_get(skb->dev);
 	if (idev) {
@@ -331,7 +333,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
 	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
 				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 				 IPSTATS_MIB_INHDRERRORS);
 		kfree_skb(skb);
 		return -1;
@@ -341,7 +343,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 
 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
 	    skb->pkt_type != PACKET_HOST) {
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 				 IPSTATS_MIB_INADDRERRORS);
 		kfree_skb(skb);
 		return -1;
@@ -356,7 +358,7 @@ looped_back:
 			 * processed by own
 			 */
 			if (!addr) {
-				IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+				IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 						 IPSTATS_MIB_INADDRERRORS);
 				kfree_skb(skb);
 				return -1;
@@ -382,7 +384,7 @@ looped_back:
 			goto unknown_rh;
 		/* Silently discard invalid RTH type 2 */
 		if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 					 IPSTATS_MIB_INHDRERRORS);
 			kfree_skb(skb);
 			return -1;
@@ -401,7 +403,7 @@ looped_back:
 	n = hdr->hdrlen >> 1;
 
 	if (hdr->segments_left > n) {
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 				 IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 				  ((&hdr->segments_left) -
@@ -415,7 +417,7 @@ looped_back:
 	if (skb_cloned(skb)) {
 		/* the copy is a forwarded packet */
 		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 					 IPSTATS_MIB_OUTDISCARDS);
 			kfree_skb(skb);
 			return -1;
@@ -438,13 +440,13 @@ looped_back:
 		if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
 				     (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
 				     IPPROTO_ROUTING) < 0) {
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 					 IPSTATS_MIB_INADDRERRORS);
 			kfree_skb(skb);
 			return -1;
 		}
 		if (!ipv6_chk_home_addr(dev_net(skb->dst->dev), addr)) {
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 					 IPSTATS_MIB_INADDRERRORS);
 			kfree_skb(skb);
 			return -1;
@@ -456,7 +458,7 @@ looped_back:
 	}
 
 	if (ipv6_addr_is_multicast(addr)) {
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 				 IPSTATS_MIB_INADDRERRORS);
 		kfree_skb(skb);
 		return -1;
@@ -476,7 +478,7 @@ looped_back:
 
 	if (skb->dst->dev->flags&IFF_LOOPBACK) {
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 					 IPSTATS_MIB_INHDRERRORS);
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 				    0, skb->dev);
@@ -492,7 +494,7 @@ looped_back:
 	return -1;
 
 unknown_rh:
-	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
+	IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
 	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 			  (&hdr->type) - skb_network_header(skb));
 	return -1;
@@ -579,29 +581,33 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
 {
 	const unsigned char *nh = skb_network_header(skb);
 	u32 pkt_len;
+	struct net *net = dev_net(skb->dst->dev);
 
 	if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
 		LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
 			       nh[optoff+1]);
-		IP6_INC_STATS_BH(ipv6_skb_idev(skb),
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
 				 IPSTATS_MIB_INHDRERRORS);
 		goto drop;
 	}
 
 	pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
 	if (pkt_len <= IPV6_MAXPLEN) {
-		IP6_INC_STATS_BH(ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS);
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
 		return 0;
 	}
 	if (ipv6_hdr(skb)->payload_len) {
-		IP6_INC_STATS_BH(ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS);
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
 		return 0;
 	}
 
 	if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
-		IP6_INC_STATS_BH(ipv6_skb_idev(skb), IPSTATS_MIB_INTRUNCATEDPKTS);
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INTRUNCATEDPKTS);
 		goto drop;
 	}
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 7e14cccd056..936f48946e2 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -59,6 +59,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	struct ipv6hdr *hdr;
 	u32 		pkt_len;
 	struct inet6_dev *idev;
+	struct net *net = dev_net(skb->dev);
 
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		kfree_skb(skb);
@@ -69,11 +70,11 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 
 	idev = __in6_dev_get(skb->dev);
 
-	IP6_INC_STATS_BH(idev, IPSTATS_MIB_INRECEIVES);
+	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INRECEIVES);
 
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
 	    !idev || unlikely(idev->cnf.disable_ipv6)) {
-		IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDISCARDS);
+		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
 		rcu_read_unlock();
 		goto out;
 	}
@@ -118,11 +119,12 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	/* pkt_len may be zero if Jumbo payload option is present */
 	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
 		if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
-			IP6_INC_STATS_BH(idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+			IP6_INC_STATS_BH(net,
+					 idev, IPSTATS_MIB_INTRUNCATEDPKTS);
 			goto drop;
 		}
 		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
-			IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS);
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
 			goto drop;
 		}
 		hdr = ipv6_hdr(skb);
@@ -130,7 +132,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 
 	if (hdr->nexthdr == NEXTHDR_HOP) {
 		if (ipv6_parse_hopopts(skb) < 0) {
-			IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS);
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
 			rcu_read_unlock();
 			return 0;
 		}
@@ -141,7 +143,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	return NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, dev, NULL,
 		       ip6_rcv_finish);
 err:
-	IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS);
+	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
 drop:
 	rcu_read_unlock();
 	kfree_skb(skb);
@@ -161,6 +163,7 @@ static int ip6_input_finish(struct sk_buff *skb)
 	int nexthdr, raw;
 	u8 hash;
 	struct inet6_dev *idev;
+	struct net *net = dev_net(skb->dst->dev);
 
 	/*
 	 *	Parse extension headers
@@ -205,24 +208,25 @@ resubmit:
 		if (ret > 0)
 			goto resubmit;
 		else if (ret == 0)
-			IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS);
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
 	} else {
 		if (!raw) {
 			if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-				IP6_INC_STATS_BH(idev, IPSTATS_MIB_INUNKNOWNPROTOS);
+				IP6_INC_STATS_BH(net, idev,
+						 IPSTATS_MIB_INUNKNOWNPROTOS);
 				icmpv6_send(skb, ICMPV6_PARAMPROB,
 					    ICMPV6_UNK_NEXTHDR, nhoff,
 					    skb->dev);
 			}
 		} else
-			IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS);
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
 		kfree_skb(skb);
 	}
 	rcu_read_unlock();
 	return 0;
 
 discard:
-	IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDISCARDS);
+	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
 	rcu_read_unlock();
 	kfree_skb(skb);
 	return 0;
@@ -240,7 +244,8 @@ int ip6_mc_input(struct sk_buff *skb)
 	struct ipv6hdr *hdr;
 	int deliver;
 
-	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCASTPKTS);
+	IP6_INC_STATS_BH(dev_net(skb->dst->dev),
+			 ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCASTPKTS);
 
 	hdr = ipv6_hdr(skb);
 	deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index db28c208f32..f0fded630f5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -103,7 +103,8 @@ static int ip6_output_finish(struct sk_buff *skb)
 	else if (dst->neighbour)
 		return dst->neighbour->output(skb);
 
-	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+	IP6_INC_STATS_BH(dev_net(dst->dev),
+			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 	kfree_skb(skb);
 	return -EINVAL;
 
@@ -458,7 +459,8 @@ int ip6_forward(struct sk_buff *skb)
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 			    0, skb->dev);
-		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 
 		kfree_skb(skb);
 		return -ETIMEDOUT;
@@ -527,8 +529,10 @@ int ip6_forward(struct sk_buff *skb)
 		/* Again, force OUTPUT device used as source address */
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
-		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
-		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -544,12 +548,12 @@ int ip6_forward(struct sk_buff *skb)
 
 	hdr->hop_limit--;
 
-	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 		       ip6_forward_finish);
 
 error:
-	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 drop:
 	kfree_skb(skb);
 	return -EINVAL;
@@ -991,7 +995,7 @@ static int ip6_dst_lookup_tail(struct sock *sk,
 
 out_err_release:
 	if (err == -ENETUNREACH)
-		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
+		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 	dst_release(*dst);
 	*dst = NULL;
 	return err;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 095bc453ff4..182f8a177e7 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1383,7 +1383,8 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 
 static inline int ip6mr_forward2_finish(struct sk_buff *skb)
 {
-	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	IP6_INC_STATS_BH(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst),
+			 IPSTATS_MIB_OUTFORWDATAGRAMS);
 	return dst_output(skb);
 }
 
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index a1d588d0b5d..88811ebbe4b 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1476,9 +1476,9 @@ out:
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS_BH(idev, ICMPV6_MLD2_REPORT);
 		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
-		IP6_INC_STATS_BH(idev, IPSTATS_MIB_OUTMCASTPKTS);
+		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
-		IP6_INC_STATS_BH(idev, IPSTATS_MIB_OUTDISCARDS);
+		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS);
 
 	if (likely(idev != NULL))
 		in6_dev_put(idev);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 63644075f2d..693d20836b3 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -213,8 +213,8 @@ static void ip6_frag_expire(unsigned long data)
 		goto out;
 
 	rcu_read_lock();
-	IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-	IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 	rcu_read_unlock();
 
 	/* Don't send error if the first segment did not arrive. */
@@ -257,7 +257,7 @@ fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst,
 	return container_of(q, struct frag_queue, q);
 
 oom:
-	IP6_INC_STATS_BH(idev, IPSTATS_MIB_REASMFAILS);
+	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS);
 	return NULL;
 }
 
@@ -267,6 +267,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	struct sk_buff *prev, *next;
 	struct net_device *dev;
 	int offset, end;
+	struct net *net = dev_net(skb->dst->dev);
 
 	if (fq->q.last_in & INET_FRAG_COMPLETE)
 		goto err;
@@ -276,7 +277,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
 
 	if ((unsigned int)end > IPV6_MAXPLEN) {
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 				 IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 				  ((u8 *)&fhdr->frag_off -
@@ -309,7 +310,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			/* RFC2460 says always send parameter problem in
 			 * this case. -DaveM
 			 */
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst),
 					 IPSTATS_MIB_INHDRERRORS);
 			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 					  offsetof(struct ipv6hdr, payload_len));
@@ -433,7 +434,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -1;
 
 err:
-	IP6_INC_STATS(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst),
+	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 		      IPSTATS_MIB_REASMFAILS);
 	kfree_skb(skb);
 	return -1;
@@ -550,7 +551,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 					  head->csum);
 
 	rcu_read_lock();
-	IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+	IP6_INC_STATS_BH(dev_net(dev),
+			 __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
 	rcu_read_unlock();
 	fq->q.fragments = NULL;
 	return 1;
@@ -564,7 +566,8 @@ out_oom:
 		printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
 out_fail:
 	rcu_read_lock();
-	IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+	IP6_INC_STATS_BH(dev_net(dev),
+			 __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 	rcu_read_unlock();
 	return -1;
 }
@@ -576,7 +579,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct net *net = dev_net(skb->dst->dev);
 
-	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS);
+	IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS);
 
 	/* Jumbo payload inhibits frag. header */
 	if (hdr->payload_len==0)
@@ -592,7 +595,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	if (!(fhdr->frag_off & htons(0xFFF9))) {
 		/* It is not a fragmented frame */
 		skb->transport_header += sizeof(struct frag_hdr);
-		IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMOKS);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMOKS);
 
 		IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
 		return 1;
@@ -614,7 +618,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return ret;
 	}
 
-	IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS);
+	IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS);
 	kfree_skb(skb);
 	return -1;
 
-- 
cgit v1.2.3


From 821d57776d4dda47ef5f0c33fdb3c761214b2f9f Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:32:43 -0700
Subject: ipv6: added net argument to IP6_ADD_STATS_BH

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 693d20836b3..af12de071f4 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -189,7 +189,7 @@ static void ip6_evictor(struct net *net, struct inet6_dev *idev)
 
 	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags);
 	if (evicted)
-		IP6_ADD_STATS_BH(idev, IPSTATS_MIB_REASMFAILS, evicted);
+		IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS, evicted);
 }
 
 static void ip6_frag_expire(unsigned long data)
-- 
cgit v1.2.3


From a862f6a6dc89c57dd3a959a1636b59f0c27169c2 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:33:06 -0700
Subject: ipv6: added net argument to ICMP6_INC_STATS

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 2 +-
 net/ipv6/ndisc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 88811ebbe4b..fa413afeb99 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1840,7 +1840,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 out:
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS(idev, type);
-		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
+		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index ce5b61763fd..6ce238ca538 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -522,7 +522,7 @@ static void __ndisc_send(struct net_device *dev,
 		      dst_output);
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS(idev, type);
-		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
+		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
 	}
 
 	if (likely(idev != NULL))
@@ -1586,7 +1586,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
 		      dst_output);
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS(idev, NDISC_REDIRECT);
-		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
+		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
 	}
 
 	if (likely(idev != NULL))
-- 
cgit v1.2.3


From e41b5368e029e79d11acb5952bc73284e5026c62 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:33:26 -0700
Subject: ipv6: added net argument to ICMP6_INC_STATS_BH

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv6.c       | 6 ++++--
 net/ipv6/icmp.c       | 4 ++--
 net/ipv6/ip6_output.c | 2 +-
 net/ipv6/mcast.c      | 2 +-
 net/ipv6/tcp_ipv6.c   | 3 ++-
 net/sctp/ipv6.c       | 2 +-
 6 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index caa7f346962..11062780bb0 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -98,7 +98,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	if (skb->len < offset + sizeof(*dh) ||
 	    skb->len < offset + __dccp_basic_hdr_len(dh)) {
-		ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
 		return;
 	}
 
@@ -107,7 +108,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			&hdr->saddr, dh->dccph_sport, inet6_iif(skb));
 
 	if (sk == NULL) {
-		ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
 		return;
 	}
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 758cdd7dc27..4c961556306 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -664,7 +664,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
 		skb_set_network_header(skb, nh);
 	}
 
-	ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS);
+	ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS);
 
 	saddr = &ipv6_hdr(skb)->saddr;
 	daddr = &ipv6_hdr(skb)->daddr;
@@ -772,7 +772,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
 	return 0;
 
 discard_it:
-	ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS);
+	ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS);
 drop_no_count:
 	kfree_skb(skb);
 	return 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f0fded630f5..e7eff320619 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1483,7 +1483,7 @@ int ip6_push_pending_frames(struct sock *sk)
 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 
 		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
-		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
 	}
 
 	err = ip6_local_out(skb);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index fa413afeb99..f06ceea1ffa 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1475,7 +1475,7 @@ static void mld_sendpack(struct sk_buff *skb)
 out:
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS_BH(idev, ICMPV6_MLD2_REPORT);
-		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
 		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6268d266c03..424d9c4a67a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -330,7 +330,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			th->dest, &hdr->saddr, th->source, skb->dev->ifindex);
 
 	if (sk == NULL) {
-		ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
 		return;
 	}
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c78da3c9dd3..4124bbb9994 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -156,7 +156,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	skb->network_header   = saveip;
 	skb->transport_header = savesctp;
 	if (!sk) {
-		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS);
+		ICMP6_INC_STATS_BH(dev_net(skb->dev), idev, ICMP6_MIB_INERRORS);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 5c5d244bd388fe498dd7f5f57cb7770aae40b9ab Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:33:50 -0700
Subject: ipv6: added net argument to ICMP6MSGOUT_INC_STATS

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 2 +-
 net/ipv6/ndisc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f06ceea1ffa..a96e4235fbf 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1839,7 +1839,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 		      dst_output);
 out:
 	if (!err) {
-		ICMP6MSGOUT_INC_STATS(idev, type);
+		ICMP6MSGOUT_INC_STATS(net, idev, type);
 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6ce238ca538..840b15780a3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -521,7 +521,7 @@ static void __ndisc_send(struct net_device *dev,
 	err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 		      dst_output);
 	if (!err) {
-		ICMP6MSGOUT_INC_STATS(idev, type);
+		ICMP6MSGOUT_INC_STATS(net, idev, type);
 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
 	}
 
@@ -1585,7 +1585,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
 	err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev,
 		      dst_output);
 	if (!err) {
-		ICMP6MSGOUT_INC_STATS(idev, NDISC_REDIRECT);
+		ICMP6MSGOUT_INC_STATS(net, idev, NDISC_REDIRECT);
 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
 	}
 
-- 
cgit v1.2.3


From 5a57d4c7fdac0e227efe8c5739fcbb263d9ae993 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:34:14 -0700
Subject: ipv6: added net argument to ICMP6MSGOUT_INC_STATS_BH

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 2 +-
 net/ipv6/mcast.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e7eff320619..c77db0b95e2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1482,7 +1482,7 @@ int ip6_push_pending_frames(struct sock *sk)
 	if (proto == IPPROTO_ICMPV6) {
 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 
-		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
+		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
 	}
 
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index a96e4235fbf..d7b3c6d398a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1474,7 +1474,7 @@ static void mld_sendpack(struct sk_buff *skb)
 		      dst_output);
 out:
 	if (!err) {
-		ICMP6MSGOUT_INC_STATS_BH(idev, ICMPV6_MLD2_REPORT);
+		ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT);
 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
-- 
cgit v1.2.3


From 55d43808eb26e689dacb95b11f956a3b1a56a5f3 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:34:54 -0700
Subject: ipv6: added net argument to ICMP6MSGIN_INC_STATS_BH

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 4c961556306..9b7d19ae5ce 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -693,7 +693,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
 
 	type = hdr->icmp6_type;
 
-	ICMP6MSGIN_INC_STATS_BH(idev, type);
+	ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type);
 
 	switch (type) {
 	case ICMPV6_ECHO_REQUEST:
-- 
cgit v1.2.3


From 9261e53701121f83eb9482347d68833e95315362 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:36:03 -0700
Subject: ipv6: making ip and icmp statistics per/namespace

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 38 ++++++++++++++++++--------------------
 net/ipv6/proc.c     |  8 +++++---
 2 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 127b240d2d8..6b509d7700d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -797,31 +797,11 @@ static void ipv6_packet_cleanup(void)
 
 static int __init init_ipv6_mibs(void)
 {
-	if (snmp_mib_init((void **)ipv6_statistics,
-			  sizeof(struct ipstats_mib)) < 0)
-		goto err_ip_mib;
-	if (snmp_mib_init((void **)icmpv6_statistics,
-			  sizeof(struct icmpv6_mib)) < 0)
-		goto err_icmp_mib;
-	if (snmp_mib_init((void **)icmpv6msg_statistics,
-			  sizeof(struct icmpv6msg_mib)) < 0)
-		goto err_icmpmsg_mib;
 	return 0;
-
-err_icmpmsg_mib:
-	snmp_mib_free((void **)icmpv6_statistics);
-err_icmp_mib:
-	snmp_mib_free((void **)ipv6_statistics);
-err_ip_mib:
-	return -ENOMEM;
-
 }
 
 static void cleanup_ipv6_mibs(void)
 {
-	snmp_mib_free((void **)ipv6_statistics);
-	snmp_mib_free((void **)icmpv6_statistics);
-	snmp_mib_free((void **)icmpv6msg_statistics);
 }
 
 static int __net_init ipv6_init_mibs(struct net *net)
@@ -832,8 +812,23 @@ static int __net_init ipv6_init_mibs(struct net *net)
 	if (snmp_mib_init((void **)net->mib.udplite_stats_in6,
 			  sizeof (struct udp_mib)) < 0)
 		goto err_udplite_mib;
+	if (snmp_mib_init((void **)net->mib.ipv6_statistics,
+			  sizeof(struct ipstats_mib)) < 0)
+		goto err_ip_mib;
+	if (snmp_mib_init((void **)net->mib.icmpv6_statistics,
+			  sizeof(struct icmpv6_mib)) < 0)
+		goto err_icmp_mib;
+	if (snmp_mib_init((void **)net->mib.icmpv6msg_statistics,
+			  sizeof(struct icmpv6msg_mib)) < 0)
+		goto err_icmpmsg_mib;
 	return 0;
 
+err_icmpmsg_mib:
+	snmp_mib_free((void **)net->mib.icmpv6_statistics);
+err_icmp_mib:
+	snmp_mib_free((void **)net->mib.ipv6_statistics);
+err_ip_mib:
+	snmp_mib_free((void **)net->mib.udplite_stats_in6);
 err_udplite_mib:
 	snmp_mib_free((void **)net->mib.udp_stats_in6);
 	return -ENOMEM;
@@ -843,6 +838,9 @@ static void __net_exit ipv6_cleanup_mibs(struct net *net)
 {
 	snmp_mib_free((void **)net->mib.udp_stats_in6);
 	snmp_mib_free((void **)net->mib.udplite_stats_in6);
+	snmp_mib_free((void **)net->mib.ipv6_statistics);
+	snmp_mib_free((void **)net->mib.icmpv6_statistics);
+	snmp_mib_free((void **)net->mib.icmpv6msg_statistics);
 }
 
 static int inet6_net_init(struct net *net)
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index c78cf754ef3..07f0b76e742 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -177,9 +177,11 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = (struct net *)seq->private;
 
-	snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
-	snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
-	snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
+	snmp6_seq_show_item(seq, (void **)net->mib.ipv6_statistics,
+			    snmp6_ipstats_list);
+	snmp6_seq_show_item(seq, (void **)net->mib.icmpv6_statistics,
+			    snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, (void **)net->mib.icmpv6msg_statistics);
 	snmp6_seq_show_item(seq, (void **)net->mib.udp_stats_in6,
 			    snmp6_udp6_list);
 	snmp6_seq_show_item(seq, (void **)net->mib.udplite_stats_in6,
-- 
cgit v1.2.3


From 2ca89cea5c9fdafd495fb840fa055383d253174e Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 8 Oct 2008 10:36:24 -0700
Subject: ipv6: remove unused not init_ipv6_mibs/cleanup_ipv6_mibs

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6b509d7700d..050e14b7f70 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -795,15 +795,6 @@ static void ipv6_packet_cleanup(void)
 	dev_remove_pack(&ipv6_packet_type);
 }
 
-static int __init init_ipv6_mibs(void)
-{
-	return 0;
-}
-
-static void cleanup_ipv6_mibs(void)
-{
-}
-
 static int __net_init ipv6_init_mibs(struct net *net)
 {
 	if (snmp_mib_init((void **)net->mib.udp_stats_in6,
@@ -935,11 +926,6 @@ static int __init inet6_init(void)
 	if (err)
 		goto out_sock_register_fail;
 
-	/* Initialise ipv6 mibs */
-	err = init_ipv6_mibs();
-	if (err)
-		goto out_unregister_sock;
-
 #ifdef CONFIG_SYSCTL
 	err = ipv6_static_sysctl_register();
 	if (err)
@@ -1073,8 +1059,6 @@ register_pernet_fail:
 	ipv6_static_sysctl_unregister();
 static_sysctl_fail:
 #endif
-	cleanup_ipv6_mibs();
-out_unregister_sock:
 	sock_unregister(PF_INET6);
 	rtnl_unregister_all(PF_INET6);
 out_sock_register_fail:
@@ -1131,7 +1115,6 @@ static void __exit inet6_exit(void)
 #ifdef CONFIG_SYSCTL
 	ipv6_static_sysctl_unregister();
 #endif
-	cleanup_ipv6_mibs();
 	proto_unregister(&rawv6_prot);
 	proto_unregister(&udplitev6_prot);
 	proto_unregister(&udpv6_prot);
-- 
cgit v1.2.3


From 52cd5750e81ec8d213949fa7c0d2e08907bf498b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Wed, 8 Oct 2008 11:34:06 -0700
Subject: tcp: fix length used for checksum in a reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While looking for some common code I came across difference
in checksum calculation between tcp_v6_send_(reset|ack) I
couldn't explain. I checked both v4 and v6 and found out that
both seem to have the same "feature". I couldn't find anything
in rfc nor anywhere else which would state that md5 option
should be ignored like it was in case of reset so I came to
a conclusion that this is probably a genuine bug. I suspect
that addition of md5 just was fooled by the excessive
copy-paste code in those functions and the reset part was
never tested well enough to find out the problem.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 net/ipv6/tcp_ipv6.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 24ffc5e1d3d..ba46769c6e9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -589,7 +589,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 #endif
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 				      ip_hdr(skb)->saddr, /* XXX */
-				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
+				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 424d9c4a67a..e8b0fdd9edb 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1012,14 +1012,14 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	}
 #endif
 
-	buff->csum = csum_partial((char *)t1, sizeof(*t1), 0);
+	buff->csum = csum_partial((char *)t1, tot_len, 0);
 
 	memset(&fl, 0, sizeof(fl));
 	ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr);
 	ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr);
 
 	t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
-				    sizeof(*t1), IPPROTO_TCP,
+				    tot_len, IPPROTO_TCP,
 				    buff->csum);
 
 	fl.proto = IPPROTO_TCP;
-- 
cgit v1.2.3


From 53e915034970935596703a6005cde27c2128b5c3 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Wed, 8 Oct 2008 11:36:22 -0700
Subject: pkt_sched: Update qdisc requeue stats in dev_requeue_skb()

After the last change of requeuing there is no info about such
incidents in tc stats. This patch updates the counter, but we should
consider this should differ from previous stats because of additional
checks preventing to repeat this. On the other hand, previous stats
didn't include requeuing of gso_segmented skbs.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 31f6b614b59..7b5572d6beb 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,6 +45,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
 	q->gso_skb = skb;
+	q->qstats.requeues++;
 	__netif_schedule(q);
 
 	return 0;
-- 
cgit v1.2.3


From 9088c5609584684149f3fb5b065aa7f18dcb03ff Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 8 Oct 2008 11:44:17 -0700
Subject: udp: Improve port randomization

Current UDP port allocation is suboptimal.
We select the shortest chain to chose a port (out of 512)
that will hash in this shortest chain.

First, it can lead to give not so ramdom ports and ease
give attackers more opportunities to break the system.

Second, it can consume a lot of CPU to scan all table
in order to find the shortest chain.

Third, in some pathological cases we can fail to find
a free port even if they are plenty of them.

This patch zap the search for a short chain and only
use one random seed. Problem of getting long chains
should be addressed in another way, since we can
obtain long chains with non random ports.

Based on a report and patch from Vitaly Mayatskikh

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 56 ++++++++++++--------------------------------------------
 1 file changed, 12 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 85f8e8e10b1..67d8430b4a2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -155,55 +155,23 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	write_lock_bh(&udp_hash_lock);
 
 	if (!snum) {
-		int i, low, high, remaining;
-		unsigned rover, best, best_size_so_far;
+		int low, high, remaining;
+		unsigned rand;
+		unsigned short first;
 
 		inet_get_local_port_range(&low, &high);
 		remaining = (high - low) + 1;
 
-		best_size_so_far = UINT_MAX;
-		best = rover = net_random() % remaining + low;
-
-		/* 1st pass: look for empty (or shortest) hash chain */
-		for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-			int size = 0;
-
-			head = &udptable[udp_hashfn(net, rover)];
-			if (hlist_empty(head))
-				goto gotit;
-
-			sk_for_each(sk2, node, head) {
-				if (++size >= best_size_so_far)
-					goto next;
-			}
-			best_size_so_far = size;
-			best = rover;
-		next:
-			/* fold back if end of range */
-			if (++rover > high)
-				rover = low + ((rover - low)
-					       & (UDP_HTABLE_SIZE - 1));
-
-
-		}
-
-		/* 2nd pass: find hole in shortest hash chain */
-		rover = best;
-		for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
-			if (! __udp_lib_lport_inuse(net, rover, udptable))
-				goto gotit;
-			rover += UDP_HTABLE_SIZE;
-			if (rover > high)
-				rover = low + ((rover - low)
-					       & (UDP_HTABLE_SIZE - 1));
+		rand = net_random();
+		snum = first = rand % remaining + low;
+		rand |= 1;
+		while (__udp_lib_lport_inuse(net, snum, udptable)) {
+			do {
+				snum = snum + rand;
+			} while (snum < low || snum > high);
+			if (snum == first)
+				goto fail;
 		}
-
-
-		/* All ports in use! */
-		goto fail;
-
-gotit:
-		snum = rover;
 	} else {
 		head = &udptable[udp_hashfn(net, snum)];
 
-- 
cgit v1.2.3


From 3c689b7320ae6f20dba6a8b71806a6c6fd604ee8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 8 Oct 2008 14:18:04 -0700
Subject: inet: cleanup of local_port_range

I noticed sysctl_local_port_range[] and its associated seqlock
sysctl_local_port_range_lock were on separate cache lines.
Moreover, sysctl_local_port_range[] was close to unrelated
variables, highly modified, leading to cache misses.

Moving these two variables in a structure can help data
locality and moving this structure to read_mostly section
helps sharing of this data among cpus.

Cleanup of extern declarations (moved in include file where
they belong), and use of inet_get_local_port_range()
accessor instead of direct access to ports values.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 16 +++++++++-------
 net/ipv4/sysctl_net_ipv4.c      | 23 ++++++++++-------------
 2 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 21fcc5a9045..bd1278a2d82 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -30,20 +30,22 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
 /*
- * This array holds the first and last local port number.
+ * This struct holds the first and last local port number.
  */
-int sysctl_local_port_range[2] = { 32768, 61000 };
-DEFINE_SEQLOCK(sysctl_port_range_lock);
+struct local_ports sysctl_local_ports __read_mostly = {
+	.lock = SEQLOCK_UNLOCKED,
+	.range = { 32768, 61000 },
+};
 
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
 	do {
-		seq = read_seqbegin(&sysctl_port_range_lock);
+		seq = read_seqbegin(&sysctl_local_ports.lock);
 
-		*low = sysctl_local_port_range[0];
-		*high = sysctl_local_port_range[1];
-	} while (read_seqretry(&sysctl_port_range_lock, seq));
+		*low = sysctl_local_ports.range[0];
+		*high = sysctl_local_ports.range[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0689fd7b79..276d047fb85 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -26,16 +26,13 @@ static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
 
-extern seqlock_t sysctl_port_range_lock;
-extern int sysctl_local_port_range[2];
-
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])
 {
-	write_seqlock(&sysctl_port_range_lock);
-	sysctl_local_port_range[0] = range[0];
-	sysctl_local_port_range[1] = range[1];
-	write_sequnlock(&sysctl_port_range_lock);
+	write_seqlock(&sysctl_local_ports.lock);
+	sysctl_local_ports.range[0] = range[0];
+	sysctl_local_ports.range[1] = range[1];
+	write_sequnlock(&sysctl_local_ports.lock);
 }
 
 /* Validate changes from /proc interface. */
@@ -44,8 +41,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
 				 size_t *lenp, loff_t *ppos)
 {
 	int ret;
-	int range[2] = { sysctl_local_port_range[0],
-			 sysctl_local_port_range[1] };
+	int range[2];
 	ctl_table tmp = {
 		.data = &range,
 		.maxlen = sizeof(range),
@@ -54,6 +50,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
 		.extra2 = &ip_local_port_range_max,
 	};
 
+	inet_get_local_port_range(range, range + 1);
 	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
@@ -73,8 +70,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
 					void __user *newval, size_t newlen)
 {
 	int ret;
-	int range[2] = { sysctl_local_port_range[0],
-			 sysctl_local_port_range[1] };
+	int range[2];
 	ctl_table tmp = {
 		.data = &range,
 		.maxlen = sizeof(range),
@@ -83,6 +79,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
 		.extra2 = &ip_local_port_range_max,
 	};
 
+	inet_get_local_port_range(range, range + 1);
 	ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen);
 	if (ret == 0 && newval && newlen) {
 		if (range[1] < range[0])
@@ -396,8 +393,8 @@ static struct ctl_table ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4_LOCAL_PORT_RANGE,
 		.procname	= "ip_local_port_range",
-		.data		= &sysctl_local_port_range,
-		.maxlen		= sizeof(sysctl_local_port_range),
+		.data		= &sysctl_local_ports.range,
+		.maxlen		= sizeof(sysctl_local_ports.range),
 		.mode		= 0644,
 		.proc_handler	= &ipv4_local_port_range,
 		.strategy	= &ipv4_sysctl_local_port_range,
-- 
cgit v1.2.3


From 8e1ee18c332e08bee9d8bd66e63cd564fbf17fc2 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 8 Oct 2008 14:18:39 -0700
Subject: sctp: Rework the tsn map to use generic bitmap.

The tsn map currently use is 4K large and is stuck inside
the sctp_association structure making memory references REALLY
expensive.  What we really need is at most 4K worth of bits
so the biggest map we would have is 512 bytes.   Also, the
map is only really usefull when we have gaps to store and
report.  As such, starting with minimal map of say 32 TSNs (bits)
should be enough for normal low-loss operations.  We can grow
the map by some multiple of 32 along with some extra room any
time we receive the TSN which would put us outside of the map
boundry.  As we close gaps, we can shift the map to rebase
it on the latest TSN we've seen.  This saves 4088 bytes per
association just in the map alone along savings from the now
unnecessary structure members.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c     |   9 +-
 net/sctp/sm_make_chunk.c |   5 +-
 net/sctp/sm_sideeffect.c |   3 +-
 net/sctp/tsnmap.c        | 331 +++++++++++++++++++++--------------------------
 net/sctp/ulpevent.c      |  10 +-
 5 files changed, 166 insertions(+), 192 deletions(-)

(limited to 'net')

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index abd51cef241..f4b23043b61 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -283,8 +283,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	if (!sctp_ulpq_init(&asoc->ulpq, asoc))
 		goto fail_init;
 
-	/* Set up the tsn tracking. */
-	sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_SIZE, 0);
+	memset(&asoc->peer.tsn_map, 0, sizeof(struct sctp_tsnmap));
 
 	asoc->need_ecne = 0;
 
@@ -402,6 +401,8 @@ void sctp_association_free(struct sctp_association *asoc)
 	/* Dispose of any pending chunks on the inqueue. */
 	sctp_inq_free(&asoc->base.inqueue);
 
+	sctp_tsnmap_free(&asoc->peer.tsn_map);
+
 	/* Free ssnmap storage. */
 	sctp_ssnmap_free(asoc->ssnmap);
 
@@ -1122,8 +1123,8 @@ void sctp_assoc_update(struct sctp_association *asoc,
 	asoc->peer.rwnd = new->peer.rwnd;
 	asoc->peer.sack_needed = new->peer.sack_needed;
 	asoc->peer.i = new->peer.i;
-	sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_SIZE,
-			 asoc->peer.i.initial_tsn);
+	sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
+			 asoc->peer.i.initial_tsn, GFP_ATOMIC);
 
 	/* Remove any peer addresses not present in the new association. */
 	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 76726bcff3e..6dd9b3ef33d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2288,8 +2288,9 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
 	}
 
 	/* Set up the TSN tracking pieces.  */
-	sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_SIZE,
-			 asoc->peer.i.initial_tsn);
+	if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
+				asoc->peer.i.initial_tsn, gfp))
+		goto clean_up;
 
 	/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
 	 *
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 13d9eea5cf1..e1d6076b4f5 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1152,7 +1152,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_REPORT_TSN:
 			/* Record the arrival of a TSN.  */
-			sctp_tsnmap_mark(&asoc->peer.tsn_map, cmd->obj.u32);
+			error = sctp_tsnmap_mark(&asoc->peer.tsn_map,
+						 cmd->obj.u32);
 			break;
 
 		case SCTP_CMD_REPORT_FWDTSN:
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index f3e58b27590..142ed7ca424 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -43,37 +43,44 @@
  */
 
 #include <linux/types.h>
+#include <linux/bitmap.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 
 static void sctp_tsnmap_update(struct sctp_tsnmap *map);
-static void sctp_tsnmap_find_gap_ack(__u8 *map, __u16 off,
-				     __u16 len, __u16 base,
-				     int *started, __u16 *start,
-				     int *ended, __u16 *end);
+static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
+				     __u16 len, __u16 *start, __u16 *end);
+static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap);
 
 /* Initialize a block of memory as a tsnmap.  */
 struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len,
-				     __u32 initial_tsn)
+				     __u32 initial_tsn, gfp_t gfp)
 {
-	map->tsn_map = map->raw_map;
-	map->overflow_map = map->tsn_map + len;
-	map->len = len;
-
-	/* Clear out a TSN ack status.  */
-	memset(map->tsn_map, 0x00, map->len + map->len);
+	if (!map->tsn_map) {
+		map->tsn_map = kzalloc(len>>3, gfp);
+		if (map->tsn_map == NULL)
+			return NULL;
+
+		map->len = len;
+	} else {
+		bitmap_zero(map->tsn_map, map->len);
+	}
 
 	/* Keep track of TSNs represented by tsn_map.  */
 	map->base_tsn = initial_tsn;
-	map->overflow_tsn = initial_tsn + map->len;
 	map->cumulative_tsn_ack_point = initial_tsn - 1;
 	map->max_tsn_seen = map->cumulative_tsn_ack_point;
-	map->malloced = 0;
 	map->num_dup_tsns = 0;
 
 	return map;
 }
 
+void sctp_tsnmap_free(struct sctp_tsnmap *map)
+{
+	map->len = 0;
+	kfree(map->tsn_map);
+}
+
 /* Test the tracking state of this TSN.
  * Returns:
  *   0 if the TSN has not yet been seen
@@ -82,66 +89,69 @@ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len,
  */
 int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn)
 {
-	__s32 gap;
-	int dup;
+	u32 gap;
+
+	/* Check to see if this is an old TSN */
+	if (TSN_lte(tsn, map->cumulative_tsn_ack_point))
+		return 1;
+
+	/* Verify that we can hold this TSN and that it will not
+	 * overlfow our map
+	 */
+	if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
+		return -1;
 
 	/* Calculate the index into the mapping arrays.  */
 	gap = tsn - map->base_tsn;
 
-	/* Verify that we can hold this TSN.  */
-	if (gap >= (/* base */ map->len + /* overflow */ map->len)) {
-		dup = -1;
-		goto out;
-	}
-
-	/* Honk if we've already seen this TSN.
-	 * We have three cases:
-	 *	1. The TSN is ancient or belongs to a previous tsn_map.
-	 *	2. The TSN is already marked in the tsn_map.
-	 *	3. The TSN is already marked in the tsn_map_overflow.
-	 */
-	if (gap < 0 ||
-	    (gap < map->len && map->tsn_map[gap]) ||
-	    (gap >= map->len && map->overflow_map[gap - map->len]))
-		dup = 1;
+	/* Check to see if TSN has already been recorded.  */
+	if (gap < map->len && test_bit(gap, map->tsn_map))
+		return 1;
 	else
-		dup = 0;
-
-out:
-	return dup;
+		return 0;
 }
 
 
 /* Mark this TSN as seen.  */
-void sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn)
+int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn)
 {
-	__s32 gap;
+	u16 gap;
 
-	/* Vacuously mark any TSN which precedes the map base or
-	 * exceeds the end of the map.
-	 */
 	if (TSN_lt(tsn, map->base_tsn))
-		return;
-	if (!TSN_lt(tsn, map->base_tsn + map->len + map->len))
-		return;
-
-	/* Bump the max.  */
-	if (TSN_lt(map->max_tsn_seen, tsn))
-		map->max_tsn_seen = tsn;
+		return 0;
 
-	/* Assert: TSN is in range.  */
 	gap = tsn - map->base_tsn;
 
-	/* Mark the TSN as received.  */
-	if (gap < map->len)
-		map->tsn_map[gap]++;
-	else
-		map->overflow_map[gap - map->len]++;
+	if (gap >= map->len && !sctp_tsnmap_grow(map, gap))
+		return -ENOMEM;
 
-	/* Go fixup any internal TSN mapping variables including
-	 * cumulative_tsn_ack_point.
-	 */
-	sctp_tsnmap_update(map);
+	if (!sctp_tsnmap_has_gap(map) && gap == 0) {
+		/* In this case the map has no gaps and the tsn we are
+		 * recording is the next expected tsn.  We don't touch
+		 * the map but simply bump the values.
+		 */
+		map->max_tsn_seen++;
+		map->cumulative_tsn_ack_point++;
+		map->base_tsn++;
+	} else {
+		/* Either we already have a gap, or about to record a gap, so
+		 * have work to do.
+		 *
+		 * Bump the max.
+		 */
+		if (TSN_lt(map->max_tsn_seen, tsn))
+			map->max_tsn_seen = tsn;
+
+		/* Mark the TSN as received.  */
+		set_bit(gap, map->tsn_map);
+
+		/* Go fixup any internal TSN mapping variables including
+		 * cumulative_tsn_ack_point.
+		 */
+		sctp_tsnmap_update(map);
+	}
+
+	return 0;
 }
 
 
@@ -160,66 +170,34 @@ SCTP_STATIC int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map,
 					 struct sctp_tsnmap_iter *iter,
 					 __u16 *start, __u16 *end)
 {
-	int started, ended;
-	__u16 start_, end_, offset;
-
-	/* We haven't found a gap yet.  */
-	started = ended = 0;
+	int ended = 0;
+	__u16 start_ = 0, end_ = 0, offset;
 
 	/* If there are no more gap acks possible, get out fast.  */
 	if (TSN_lte(map->max_tsn_seen, iter->start))
 		return 0;
 
-	/* Search the first mapping array.  */
-	if (iter->start - map->base_tsn < map->len) {
-
-		offset = iter->start - map->base_tsn;
-		sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len, 0,
-					 &started, &start_, &ended, &end_);
-	}
-
-	/* Do we need to check the overflow map? */
-	if (!ended) {
-		/* Fix up where we'd like to start searching in the
-		 * overflow map.
-		 */
-		if (iter->start - map->base_tsn < map->len)
-			offset = 0;
-		else
-			offset = iter->start - map->base_tsn - map->len;
-
-		/* Search the overflow map.  */
-		sctp_tsnmap_find_gap_ack(map->overflow_map,
-					 offset,
-					 map->len,
-					 map->len,
-					 &started, &start_,
-					 &ended, &end_);
-	}
+	offset = iter->start - map->base_tsn;
+	sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len,
+				 &start_, &end_);
 
-	/* The Gap Ack Block happens to end at the end of the
-	 * overflow map.
-	 */
-	if (started && !ended) {
-		ended++;
-		end_ = map->len + map->len - 1;
-	}
+	/* The Gap Ack Block happens to end at the end of the map. */
+	if (start_ && !end_)
+		end_ = map->len - 1;
 
 	/* If we found a Gap Ack Block, return the start and end and
 	 * bump the iterator forward.
 	 */
-	if (ended) {
+	if (end_) {
 		/* Fix up the start and end based on the
-		 * Cumulative TSN Ack offset into the map.
+		 * Cumulative TSN Ack which is always 1 behind base.
 		 */
-		int gap = map->cumulative_tsn_ack_point -
-			map->base_tsn;
-
-		*start = start_ - gap;
-		*end = end_ - gap;
+		*start = start_ + 1;
+		*end = end_ + 1;
 
 		/* Move the iterator forward.  */
 		iter->start = map->cumulative_tsn_ack_point + *end + 1;
+		ended = 1;
 	}
 
 	return ended;
@@ -228,35 +206,33 @@ SCTP_STATIC int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map,
 /* Mark this and any lower TSN as seen.  */
 void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn)
 {
-	__s32 gap;
+	u32 gap;
 
-	/* Vacuously mark any TSN which precedes the map base or
-	 * exceeds the end of the map.
-	 */
 	if (TSN_lt(tsn, map->base_tsn))
 		return;
-	if (!TSN_lt(tsn, map->base_tsn + map->len + map->len))
+	if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
 		return;
 
 	/* Bump the max.  */
 	if (TSN_lt(map->max_tsn_seen, tsn))
 		map->max_tsn_seen = tsn;
 
-	/* Assert: TSN is in range.  */
 	gap = tsn - map->base_tsn + 1;
 
-	/* Mark the TSNs as received.  */
-	if (gap <= map->len)
-		memset(map->tsn_map, 0x01, gap);
-	else {
-		memset(map->tsn_map, 0x01, map->len);
-		memset(map->overflow_map, 0x01, (gap - map->len));
+	map->base_tsn += gap;
+	map->cumulative_tsn_ack_point += gap;
+	if (gap >= map->len) {
+		/* If our gap is larger then the map size, just
+		 * zero out the map.
+		 */
+		bitmap_zero(map->tsn_map, map->len);
+	} else {
+		/* If the gap is smaller then the map size,
+		 * shift the map by 'gap' bits and update further.
+		 */
+		bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len);
+		sctp_tsnmap_update(map);
 	}
-
-	/* Go fixup any internal TSN mapping variables including
-	 * cumulative_tsn_ack_point.
-	 */
-	sctp_tsnmap_update(map);
 }
 
 /********************************************************************
@@ -268,27 +244,19 @@ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn)
  */
 static void sctp_tsnmap_update(struct sctp_tsnmap *map)
 {
-	__u32 ctsn;
-
-	ctsn = map->cumulative_tsn_ack_point;
-	do {
-		ctsn++;
-		if (ctsn == map->overflow_tsn) {
-			/* Now tsn_map must have been all '1's,
-			 * so we swap the map and check the overflow table
-			 */
-			__u8 *tmp = map->tsn_map;
-			memset(tmp, 0, map->len);
-			map->tsn_map = map->overflow_map;
-			map->overflow_map = tmp;
-
-			/* Update the tsn_map boundaries.  */
-			map->base_tsn += map->len;
-			map->overflow_tsn += map->len;
-		}
-	} while (map->tsn_map[ctsn - map->base_tsn]);
+	u16 len;
+	unsigned long zero_bit;
+
+
+	len = map->max_tsn_seen - map->cumulative_tsn_ack_point;
+	zero_bit = find_first_zero_bit(map->tsn_map, len);
+	if (!zero_bit)
+		return;		/* The first 0-bit is bit 0.  nothing to do */
+
+	map->base_tsn += zero_bit;
+	map->cumulative_tsn_ack_point += zero_bit;
 
-	map->cumulative_tsn_ack_point = ctsn - 1; /* Back up one. */
+	bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len);
 }
 
 /* How many data chunks  are we missing from our peer?
@@ -299,31 +267,19 @@ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map)
 	__u32 max_tsn = map->max_tsn_seen;
 	__u32 base_tsn = map->base_tsn;
 	__u16 pending_data;
-	__s32 gap, start, end, i;
+	u32 gap, i;
 
 	pending_data = max_tsn - cum_tsn;
 	gap = max_tsn - base_tsn;
 
-	if (gap <= 0 || gap >= (map->len + map->len))
+	if (gap == 0 || gap >= map->len)
 		goto out;
 
-	start = ((cum_tsn >= base_tsn) ? (cum_tsn - base_tsn + 1) : 0);
-	end = ((gap > map->len ) ? map->len : gap + 1);
-
-	for (i = start; i < end; i++) {
-		if (map->tsn_map[i])
+	for (i = 0; i < gap+1; i++) {
+		if (test_bit(i, map->tsn_map))
 			pending_data--;
 	}
 
-	if (gap >= map->len) {
-		start = 0;
-		end = gap - map->len + 1;
-		for (i = start; i < end; i++) {
-			if (map->overflow_map[i])
-				pending_data--;
-		}
-	}
-
 out:
 	return pending_data;
 }
@@ -334,10 +290,8 @@ out:
  * The flags "started" and "ended" tell is if we found the beginning
  * or (respectively) the end of a Gap Ack Block.
  */
-static void sctp_tsnmap_find_gap_ack(__u8 *map, __u16 off,
-				     __u16 len, __u16 base,
-				     int *started, __u16 *start,
-				     int *ended, __u16 *end)
+static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
+				     __u16 len, __u16 *start, __u16 *end)
 {
 	int i = off;
 
@@ -348,49 +302,36 @@ static void sctp_tsnmap_find_gap_ack(__u8 *map, __u16 off,
 	/* Also, stop looking past the maximum TSN seen. */
 
 	/* Look for the start. */
-	if (!(*started)) {
-		for (; i < len; i++) {
-			if (map[i]) {
-				(*started)++;
-				*start = base + i;
-				break;
-			}
-		}
-	}
+	i = find_next_bit(map, len, off);
+	if (i < len)
+		*start = i;
 
 	/* Look for the end.  */
-	if (*started) {
+	if (*start) {
 		/* We have found the start, let's find the
 		 * end.  If we find the end, break out.
 		 */
-		for (; i < len; i++) {
-			if (!map[i]) {
-				(*ended)++;
-				*end = base + i - 1;
-				break;
-			}
-		}
+		i = find_next_zero_bit(map, len, i);
+		if (i < len)
+			*end = i - 1;
 	}
 }
 
 /* Renege that we have seen a TSN.  */
 void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn)
 {
-	__s32 gap;
+	u32 gap;
 
 	if (TSN_lt(tsn, map->base_tsn))
 		return;
-	if (!TSN_lt(tsn, map->base_tsn + map->len + map->len))
+	/* Assert: TSN is in range.  */
+	if (!TSN_lt(tsn, map->base_tsn + map->len))
 		return;
 
-	/* Assert: TSN is in range.  */
 	gap = tsn - map->base_tsn;
 
 	/* Pretend we never saw the TSN.  */
-	if (gap < map->len)
-		map->tsn_map[gap] = 0;
-	else
-		map->overflow_map[gap - map->len] = 0;
+	clear_bit(gap, map->tsn_map);
 }
 
 /* How many gap ack blocks do we have recorded? */
@@ -416,3 +357,27 @@ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map)
 	}
 	return gabs;
 }
+
+static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap)
+{
+	unsigned long *new;
+	unsigned long inc;
+	u16  len;
+
+	if (gap >= SCTP_TSN_MAP_SIZE)
+		return 0;
+
+	inc = ALIGN((gap - map->len),BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT;
+	len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE);
+
+	new = kzalloc(len>>3, GFP_ATOMIC);
+	if (!new)
+		return 0;
+
+	bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->base_tsn);
+	kfree(map->tsn_map);
+	map->tsn_map = new;
+	map->len = len;
+
+	return 1;
+}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index a1f654aea26..5f186ca550d 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -713,7 +713,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	/* Now that all memory allocations for this chunk succeeded, we
 	 * can mark it as received so the tsn_map is updated correctly.
 	 */
-	sctp_tsnmap_mark(&asoc->peer.tsn_map, ntohl(chunk->subh.data_hdr->tsn));
+	if (sctp_tsnmap_mark(&asoc->peer.tsn_map,
+			     ntohl(chunk->subh.data_hdr->tsn)))
+		goto fail_mark;
 
 	/* First calculate the padding, so we don't inadvertently
 	 * pass up the wrong length to the user.
@@ -755,8 +757,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	event->msg_flags |= chunk->chunk_hdr->flags;
 	event->iif = sctp_chunk_iif(chunk);
 
-fail:
 	return event;
+
+fail_mark:
+	kfree_skb(skb);
+fail:
+	return NULL;
 }
 
 /* Create a partial delivery related event.
-- 
cgit v1.2.3


From 02015180e2509afd2e3fe3790a333b30708a116b Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 8 Oct 2008 14:19:01 -0700
Subject: sctp: shrink sctp_tsnmap some more by removing gabs array

The gabs array in the sctp_tsnmap structure is only used
in one place, sctp_make_sack().  As such, carrying the
array around in the sctp_tsnmap and thus directly in
the sctp_association is rather pointless since most
of the time it's just taking up space.  Now, let
sctp_make_sack create and populate it and then throw
it away when it's done.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c |  6 ++++--
 net/sctp/tsnmap.c        | 15 ++++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 6dd9b3ef33d..fd8acb48c3f 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -702,12 +702,14 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 	__u32 ctsn;
 	__u16 num_gabs, num_dup_tsns;
 	struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
+	struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
 
+	memset(gabs, 0, sizeof(gabs));
 	ctsn = sctp_tsnmap_get_ctsn(map);
 	SCTP_DEBUG_PRINTK("sackCTSNAck sent:  0x%x.\n", ctsn);
 
 	/* How much room is needed in the chunk? */
-	num_gabs = sctp_tsnmap_num_gabs(map);
+	num_gabs = sctp_tsnmap_num_gabs(map, gabs);
 	num_dup_tsns = sctp_tsnmap_num_dups(map);
 
 	/* Initialize the SACK header.  */
@@ -763,7 +765,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 	/* Add the gap ack block information.   */
 	if (num_gabs)
 		sctp_addto_chunk(retval, sizeof(__u32) * num_gabs,
-				 sctp_tsnmap_get_gabs(map));
+				 gabs);
 
 	/* Add the duplicate TSN information.  */
 	if (num_dup_tsns)
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index 142ed7ca424..35c73e82553 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -335,10 +335,11 @@ void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn)
 }
 
 /* How many gap ack blocks do we have recorded? */
-__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map)
+__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
+			   struct sctp_gap_ack_block *gabs)
 {
 	struct sctp_tsnmap_iter iter;
-	int gabs = 0;
+	int ngaps = 0;
 
 	/* Refresh the gap ack information. */
 	if (sctp_tsnmap_has_gap(map)) {
@@ -348,14 +349,14 @@ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map)
 						&start,
 						&end)) {
 
-			map->gabs[gabs].start = htons(start);
-			map->gabs[gabs].end = htons(end);
-			gabs++;
-			if (gabs >= SCTP_MAX_GABS)
+			gabs[ngaps].start = htons(start);
+			gabs[ngaps].end = htons(end);
+			ngaps++;
+			if (ngaps >= SCTP_MAX_GABS)
 				break;
 		}
 	}
-	return gabs;
+	return ngaps;
 }
 
 static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap)
-- 
cgit v1.2.3


From 53b125779fb0b29e5b316bf3dc7d199e6dcea567 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Wed, 8 Oct 2008 14:36:33 -0700
Subject: tcpv6: fix option space offsets with md5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

More breakage :-), part of timestamps just were previously
overwritten.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e8b0fdd9edb..dd7bdde7bdd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1088,7 +1088,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
 		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 				(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
 		*topt++ = htonl(tcp_time_stamp);
-		*topt = htonl(ts);
+		*topt++ = htonl(ts);
 	}
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
cgit v1.2.3


From 071d7ab6649eb34a873a53e71635186e9117101d Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Wed, 8 Oct 2008 14:41:35 -0700
Subject: ipvs: Remove stray file left over from ipvs move

Commit cb7f6a7b716e801097b564dec3ccb58d330aef56 ("IPVS: Move IPVS to
net/netfilter/ipvs") has left a stray file in the old location of ipvs.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipvs/ip_vs_dh.c | 261 -----------------------------------------------
 1 file changed, 261 deletions(-)
 delete mode 100644 net/ipv4/ipvs/ip_vs_dh.c

(limited to 'net')

diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
deleted file mode 100644
index a16943fd72f..00000000000
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * IPVS:        Destination Hashing scheduling module
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              Inspired by the consistent hashing scheduler patch from
- *              Thomas Proell <proellt@gmx.de>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The dh algorithm is to select server by the hash key of destination IP
- * address. The pseudo code is as follows:
- *
- *       n <- servernode[dest_ip];
- *       if (n is dead) OR
- *          (n is overloaded) OR (n.weight <= 0) then
- *                 return NULL;
- *
- *       return n;
- *
- * Notes that servernode is a 256-bucket hash table that maps the hash
- * index derived from packet destination IP address to the current server
- * array. If the dh scheduler is used in cache cluster, it is good to
- * combine it with cache_bypass feature. When the statically assigned
- * server is dead or overloaded, the load balancer can bypass the cache
- * server and send requests to the original server directly.
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *      IPVS DH bucket
- */
-struct ip_vs_dh_bucket {
-	struct ip_vs_dest       *dest;          /* real server (cache) */
-};
-
-/*
- *     for IPVS DH entry hash table
- */
-#ifndef CONFIG_IP_VS_DH_TAB_BITS
-#define CONFIG_IP_VS_DH_TAB_BITS        8
-#endif
-#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
-#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
-#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
-
-
-/*
- *	Returns hash value for IPVS DH entry
- */
-static inline unsigned ip_vs_dh_hashkey(__be32 addr)
-{
-	return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
-}
-
-
-/*
- *      Get ip_vs_dest associated with supplied parameters.
- */
-static inline struct ip_vs_dest *
-ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr)
-{
-	return (tbl[ip_vs_dh_hashkey(addr)]).dest;
-}
-
-
-/*
- *      Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
-{
-	int i;
-	struct ip_vs_dh_bucket *b;
-	struct list_head *p;
-	struct ip_vs_dest *dest;
-
-	b = tbl;
-	p = &svc->destinations;
-	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
-		if (list_empty(p)) {
-			b->dest = NULL;
-		} else {
-			if (p == &svc->destinations)
-				p = p->next;
-
-			dest = list_entry(p, struct ip_vs_dest, n_list);
-			atomic_inc(&dest->refcnt);
-			b->dest = dest;
-
-			p = p->next;
-		}
-		b++;
-	}
-	return 0;
-}
-
-
-/*
- *      Flush all the hash buckets of the specified table.
- */
-static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
-{
-	int i;
-	struct ip_vs_dh_bucket *b;
-
-	b = tbl;
-	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
-		if (b->dest) {
-			atomic_dec(&b->dest->refcnt);
-			b->dest = NULL;
-		}
-		b++;
-	}
-}
-
-
-static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_dh_bucket *tbl;
-
-	/* allocate the DH table for this service */
-	tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
-		      GFP_ATOMIC);
-	if (tbl == NULL) {
-		IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
-		return -ENOMEM;
-	}
-	svc->sched_data = tbl;
-	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
-		  "current service\n",
-		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
-	/* assign the hash buckets with the updated service */
-	ip_vs_dh_assign(tbl, svc);
-
-	return 0;
-}
-
-
-static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
-	/* got to clean up hash buckets here */
-	ip_vs_dh_flush(tbl);
-
-	/* release the table itself */
-	kfree(svc->sched_data);
-	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
-		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
-	return 0;
-}
-
-
-static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
-{
-	struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
-	/* got to clean up hash buckets here */
-	ip_vs_dh_flush(tbl);
-
-	/* assign the hash buckets with the updated service */
-	ip_vs_dh_assign(tbl, svc);
-
-	return 0;
-}
-
-
-/*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- *      Destination hashing scheduling
- */
-static struct ip_vs_dest *
-ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-	struct ip_vs_dest *dest;
-	struct ip_vs_dh_bucket *tbl;
-	struct iphdr *iph = ip_hdr(skb);
-
-	IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
-
-	tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
-	dest = ip_vs_dh_get(tbl, iph->daddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
-		return NULL;
-	}
-
-	IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
-		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(iph->daddr),
-		  NIPQUAD(dest->addr.ip),
-		  ntohs(dest->port));
-
-	return dest;
-}
-
-
-/*
- *      IPVS DH Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_dh_scheduler =
-{
-	.name =			"dh",
-	.refcnt =		ATOMIC_INIT(0),
-	.module =		THIS_MODULE,
-	.n_list =		LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-	.supports_ipv6 =	0,
-#endif
-	.init_service =		ip_vs_dh_init_svc,
-	.done_service =		ip_vs_dh_done_svc,
-	.update_service =	ip_vs_dh_update_svc,
-	.schedule =		ip_vs_dh_schedule,
-};
-
-
-static int __init ip_vs_dh_init(void)
-{
-	return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
-}
-
-
-static void __exit ip_vs_dh_cleanup(void)
-{
-	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
-}
-
-
-module_init(ip_vs_dh_init);
-module_exit(ip_vs_dh_cleanup);
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 91da11f870f00a3322b81c73042291d7f0be5a17 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Tue, 7 Oct 2008 13:44:02 +0000
Subject: net: Distributed Switch Architecture protocol support

Distributed Switch Architecture is a protocol for managing hardware
switch chips.  It consists of a set of MII management registers and
commands to configure the switch, and an ethernet header format to
signal which of the ports of the switch a packet was received from
or is intended to be sent to.

The switches that this driver supports are typically embedded in
access points and routers, and a typical setup with a DSA switch
looks something like this:

	+-----------+       +-----------+
	|           | RGMII |           |
	|           +-------+           +------ 1000baseT MDI ("WAN")
	|           |       |  6-port   +------ 1000baseT MDI ("LAN1")
	|    CPU    |       |  ethernet +------ 1000baseT MDI ("LAN2")
	|           |MIImgmt|  switch   +------ 1000baseT MDI ("LAN3")
	|           +-------+  w/5 PHYs +------ 1000baseT MDI ("LAN4")
	|           |       |           |
	+-----------+       +-----------+

The switch driver presents each port on the switch as a separate
network interface to Linux, polls the switch to maintain software
link state of those ports, forwards MII management interface
accesses to those network interfaces (e.g. as done by ethtool) to
the switch, and exposes the switch's hardware statistics counters
via the appropriate Linux kernel interfaces.

This initial patch supports the MII management interface register
layout of the Marvell 88E6123, 88E6161 and 88E6165 switch chips, and
supports the "Ethertype DSA" packet tagging format.

(There is no officially registered ethertype for the Ethertype DSA
packet format, so we just grab a random one.  The ethertype to use
is programmed into the switch, and the switch driver uses the value
of ETH_P_EDSA for this, so this define can be changed at any time in
the future if the one we chose is allocated to another protocol or
if Ethertype DSA gets its own officially registered ethertype, and
everything will continue to work.)

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Nicolas Pitre <nico@marvell.com>
Tested-by: Byron Bradley <byron.bbradley@gmail.com>
Tested-by: Tim Ellis <tim.ellis@mac.com>
Tested-by: Peter van Valderen <linux@ddcrew.com>
Tested-by: Dirk Teurlings <dirk@upexia.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/Kconfig               |   1 +
 net/Makefile              |   1 +
 net/dsa/Kconfig           |  31 ++++
 net/dsa/Makefile          |   9 +
 net/dsa/dsa.c             | 369 ++++++++++++++++++++++++++++++++++++++++
 net/dsa/dsa_priv.h        | 110 ++++++++++++
 net/dsa/mv88e6123_61_65.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++
 net/dsa/mv88e6xxx.c       | 377 +++++++++++++++++++++++++++++++++++++++++
 net/dsa/mv88e6xxx.h       |  77 +++++++++
 net/dsa/slave.c           | 288 ++++++++++++++++++++++++++++++++
 net/dsa/tag_edsa.c        | 213 +++++++++++++++++++++++
 11 files changed, 1893 insertions(+)
 create mode 100644 net/dsa/Kconfig
 create mode 100644 net/dsa/Makefile
 create mode 100644 net/dsa/dsa.c
 create mode 100644 net/dsa/dsa_priv.h
 create mode 100644 net/dsa/mv88e6123_61_65.c
 create mode 100644 net/dsa/mv88e6xxx.c
 create mode 100644 net/dsa/mv88e6xxx.h
 create mode 100644 net/dsa/slave.c
 create mode 100644 net/dsa/tag_edsa.c

(limited to 'net')

diff --git a/net/Kconfig b/net/Kconfig
index 9103a16a77b..d789d79551a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -180,6 +180,7 @@ source "net/tipc/Kconfig"
 source "net/atm/Kconfig"
 source "net/802/Kconfig"
 source "net/bridge/Kconfig"
+source "net/dsa/Kconfig"
 source "net/8021q/Kconfig"
 source "net/decnet/Kconfig"
 source "net/llc/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index acaf819f24a..27d1f10dc0e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_NET_SCHED)		+= sched/
 obj-$(CONFIG_BRIDGE)		+= bridge/
+obj-$(CONFIG_NET_DSA)		+= dsa/
 obj-$(CONFIG_IPX)		+= ipx/
 obj-$(CONFIG_ATALK)		+= appletalk/
 obj-$(CONFIG_WAN_ROUTER)	+= wanrouter/
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
new file mode 100644
index 00000000000..7cf55e5eb39
--- /dev/null
+++ b/net/dsa/Kconfig
@@ -0,0 +1,31 @@
+menuconfig NET_DSA
+	bool "Distributed Switch Architecture support"
+	default n
+	depends on EXPERIMENTAL
+	---help---
+	  This allows you to use hardware switch chips that use
+	  the Distributed Switch Architecture.
+
+
+if NET_DSA
+
+# tagging formats
+config NET_DSA_TAG_EDSA
+	bool
+	default n
+
+
+# switch drivers
+config NET_DSA_MV88E6XXX
+	bool
+	default n
+
+config NET_DSA_MV88E6123_61_65
+	bool "Marvell 88E6123/6161/6165 ethernet switch chip support"
+	select NET_DSA_MV88E6XXX
+	select NET_DSA_TAG_EDSA
+	---help---
+	  This enables support for the Marvell 88E6123/6161/6165
+	  ethernet switch chips.
+
+endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
new file mode 100644
index 00000000000..b59a6f6bcf5
--- /dev/null
+++ b/net/dsa/Makefile
@@ -0,0 +1,9 @@
+# tagging formats
+obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+
+# switch drivers
+obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
+obj-$(CONFIG_NET_DSA_MV88E6123_61_65) += mv88e6123_61_65.o
+
+# the core
+obj-$(CONFIG_NET_DSA) += dsa.o slave.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
new file mode 100644
index 00000000000..6cc5be2ec7f
--- /dev/null
+++ b/net/dsa/dsa.c
@@ -0,0 +1,369 @@
+/*
+ * net/dsa/dsa.c - Hardware switch handling
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <net/dsa.h>
+#include "dsa_priv.h"
+
+char dsa_driver_version[] = "0.1";
+
+
+/* switch driver registration ***********************************************/
+static DEFINE_MUTEX(dsa_switch_drivers_mutex);
+static LIST_HEAD(dsa_switch_drivers);
+
+void register_switch_driver(struct dsa_switch_driver *drv)
+{
+	mutex_lock(&dsa_switch_drivers_mutex);
+	list_add_tail(&drv->list, &dsa_switch_drivers);
+	mutex_unlock(&dsa_switch_drivers_mutex);
+}
+
+void unregister_switch_driver(struct dsa_switch_driver *drv)
+{
+	mutex_lock(&dsa_switch_drivers_mutex);
+	list_del_init(&drv->list);
+	mutex_unlock(&dsa_switch_drivers_mutex);
+}
+
+static struct dsa_switch_driver *
+dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name)
+{
+	struct dsa_switch_driver *ret;
+	struct list_head *list;
+	char *name;
+
+	ret = NULL;
+	name = NULL;
+
+	mutex_lock(&dsa_switch_drivers_mutex);
+	list_for_each(list, &dsa_switch_drivers) {
+		struct dsa_switch_driver *drv;
+
+		drv = list_entry(list, struct dsa_switch_driver, list);
+
+		name = drv->probe(bus, sw_addr);
+		if (name != NULL) {
+			ret = drv;
+			break;
+		}
+	}
+	mutex_unlock(&dsa_switch_drivers_mutex);
+
+	*_name = name;
+
+	return ret;
+}
+
+
+/* basic switch operations **************************************************/
+static struct dsa_switch *
+dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
+		 struct mii_bus *bus, struct net_device *dev)
+{
+	struct dsa_switch *ds;
+	int ret;
+	struct dsa_switch_driver *drv;
+	char *name;
+	int i;
+
+	/*
+	 * Probe for switch model.
+	 */
+	drv = dsa_switch_probe(bus, pd->sw_addr, &name);
+	if (drv == NULL) {
+		printk(KERN_ERR "%s: could not detect attached switch\n",
+		       dev->name);
+		return ERR_PTR(-EINVAL);
+	}
+	printk(KERN_INFO "%s: detected a %s switch\n", dev->name, name);
+
+
+	/*
+	 * Allocate and initialise switch state.
+	 */
+	ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL);
+	if (ds == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ds->pd = pd;
+	ds->master_netdev = dev;
+	ds->master_mii_bus = bus;
+
+	ds->drv = drv;
+	ds->tag_protocol = drv->tag_protocol;
+
+
+	/*
+	 * Validate supplied switch configuration.
+	 */
+	ds->cpu_port = -1;
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		char *name;
+
+		name = pd->port_names[i];
+		if (name == NULL)
+			continue;
+
+		if (!strcmp(name, "cpu")) {
+			if (ds->cpu_port != -1) {
+				printk(KERN_ERR "multiple cpu ports?!\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			ds->cpu_port = i;
+		} else {
+			ds->valid_port_mask |= 1 << i;
+		}
+	}
+
+	if (ds->cpu_port == -1) {
+		printk(KERN_ERR "no cpu port?!\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+
+	/*
+	 * If we use a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point on get
+	 * sent to the tag format's receive function.  (Which will
+	 * discard received packets until we set ds->ports[] below.)
+	 */
+	wmb();
+	dev->dsa_ptr = (void *)ds;
+
+
+	/*
+	 * Do basic register setup.
+	 */
+	ret = drv->setup(ds);
+	if (ret < 0)
+		goto out;
+
+	ret = drv->set_addr(ds, dev->dev_addr);
+	if (ret < 0)
+		goto out;
+
+	ds->slave_mii_bus = mdiobus_alloc();
+	if (ds->slave_mii_bus == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	dsa_slave_mii_bus_init(ds);
+
+	ret = mdiobus_register(ds->slave_mii_bus);
+	if (ret < 0)
+		goto out_free;
+
+
+	/*
+	 * Create network devices for physical switch ports.
+	 */
+	wmb();
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		struct net_device *slave_dev;
+
+		if (!(ds->valid_port_mask & (1 << i)))
+			continue;
+
+		slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]);
+		if (slave_dev == NULL) {
+			printk(KERN_ERR "%s: can't create dsa slave "
+			       "device for port %d(%s)\n",
+			       dev->name, i, pd->port_names[i]);
+			continue;
+		}
+
+		ds->ports[i] = slave_dev;
+	}
+
+	return ds;
+
+out_free:
+	mdiobus_free(ds->slave_mii_bus);
+out:
+	dev->dsa_ptr = NULL;
+	kfree(ds);
+	return ERR_PTR(ret);
+}
+
+static void dsa_switch_destroy(struct dsa_switch *ds)
+{
+}
+
+
+/* link polling *************************************************************/
+static void dsa_link_poll_work(struct work_struct *ugly)
+{
+	struct dsa_switch *ds;
+
+	ds = container_of(ugly, struct dsa_switch, link_poll_work);
+
+	ds->drv->poll_link(ds);
+	mod_timer(&ds->link_poll_timer, round_jiffies(jiffies + HZ));
+}
+
+static void dsa_link_poll_timer(unsigned long _ds)
+{
+	struct dsa_switch *ds = (void *)_ds;
+
+	schedule_work(&ds->link_poll_work);
+}
+
+
+/* platform driver init and cleanup *****************************************/
+static int dev_is_class(struct device *dev, void *class)
+{
+	if (dev->class != NULL && !strcmp(dev->class->name, class))
+		return 1;
+
+	return 0;
+}
+
+static struct device *dev_find_class(struct device *parent, char *class)
+{
+	if (dev_is_class(parent, class)) {
+		get_device(parent);
+		return parent;
+	}
+
+	return device_find_child(parent, class, dev_is_class);
+}
+
+static struct mii_bus *dev_to_mii_bus(struct device *dev)
+{
+	struct device *d;
+
+	d = dev_find_class(dev, "mdio_bus");
+	if (d != NULL) {
+		struct mii_bus *bus;
+
+		bus = to_mii_bus(d);
+		put_device(d);
+
+		return bus;
+	}
+
+	return NULL;
+}
+
+static struct net_device *dev_to_net_device(struct device *dev)
+{
+	struct device *d;
+
+	d = dev_find_class(dev, "net");
+	if (d != NULL) {
+		struct net_device *nd;
+
+		nd = to_net_dev(d);
+		dev_hold(nd);
+		put_device(d);
+
+		return nd;
+	}
+
+	return NULL;
+}
+
+static int dsa_probe(struct platform_device *pdev)
+{
+	static int dsa_version_printed;
+	struct dsa_platform_data *pd = pdev->dev.platform_data;
+	struct net_device *dev;
+	struct mii_bus *bus;
+	struct dsa_switch *ds;
+
+	if (!dsa_version_printed++)
+		printk(KERN_NOTICE "Distributed Switch Architecture "
+			"driver version %s\n", dsa_driver_version);
+
+	if (pd == NULL || pd->mii_bus == NULL || pd->netdev == NULL)
+		return -EINVAL;
+
+	bus = dev_to_mii_bus(pd->mii_bus);
+	if (bus == NULL)
+		return -EINVAL;
+
+	dev = dev_to_net_device(pd->netdev);
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (dev->dsa_ptr != NULL) {
+		dev_put(dev);
+		return -EEXIST;
+	}
+
+	ds = dsa_switch_setup(&pdev->dev, pd, bus, dev);
+	if (IS_ERR(ds)) {
+		dev_put(dev);
+		return PTR_ERR(ds);
+	}
+
+	if (ds->drv->poll_link != NULL) {
+		INIT_WORK(&ds->link_poll_work, dsa_link_poll_work);
+		init_timer(&ds->link_poll_timer);
+		ds->link_poll_timer.data = (unsigned long)ds;
+		ds->link_poll_timer.function = dsa_link_poll_timer;
+		ds->link_poll_timer.expires = round_jiffies(jiffies + HZ);
+		add_timer(&ds->link_poll_timer);
+	}
+
+	platform_set_drvdata(pdev, ds);
+
+	return 0;
+}
+
+static int dsa_remove(struct platform_device *pdev)
+{
+	struct dsa_switch *ds = platform_get_drvdata(pdev);
+
+	if (ds->drv->poll_link != NULL)
+		del_timer_sync(&ds->link_poll_timer);
+
+	flush_scheduled_work();
+
+	dsa_switch_destroy(ds);
+
+	return 0;
+}
+
+static void dsa_shutdown(struct platform_device *pdev)
+{
+}
+
+static struct platform_driver dsa_driver = {
+	.probe		= dsa_probe,
+	.remove		= dsa_remove,
+	.shutdown	= dsa_shutdown,
+	.driver = {
+		.name	= "dsa",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init dsa_init_module(void)
+{
+	return platform_driver_register(&dsa_driver);
+}
+module_init(dsa_init_module);
+
+static void __exit dsa_cleanup_module(void)
+{
+	platform_driver_unregister(&dsa_driver);
+}
+module_exit(dsa_cleanup_module);
+
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>")
+MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:dsa");
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
new file mode 100644
index 00000000000..21ee9052079
--- /dev/null
+++ b/net/dsa/dsa_priv.h
@@ -0,0 +1,110 @@
+/*
+ * net/dsa/dsa_priv.h - Hardware switch handling
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __DSA_PRIV_H
+#define __DSA_PRIV_H
+
+#include <linux/list.h>
+#include <linux/phy.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <net/dsa.h>
+
+struct dsa_switch {
+	/*
+	 * Configuration data for the platform device that owns
+	 * this dsa switch instance.
+	 */
+	struct dsa_platform_data	*pd;
+
+	/*
+	 * References to network device and mii bus to use.
+	 */
+	struct net_device		*master_netdev;
+	struct mii_bus			*master_mii_bus;
+
+	/*
+	 * The used switch driver and frame tagging type.
+	 */
+	struct dsa_switch_driver	*drv;
+	__be16				tag_protocol;
+
+	/*
+	 * Slave mii_bus and devices for the individual ports.
+	 */
+	int				cpu_port;
+	u32				valid_port_mask;
+	struct mii_bus			*slave_mii_bus;
+	struct net_device		*ports[DSA_MAX_PORTS];
+
+	/*
+	 * Link state polling.
+	 */
+	struct work_struct		link_poll_work;
+	struct timer_list		link_poll_timer;
+};
+
+struct dsa_slave_priv {
+	struct net_device	*dev;
+	struct dsa_switch	*parent;
+	int			port;
+	struct phy_device	*phy;
+};
+
+struct dsa_switch_driver {
+	struct list_head	list;
+
+	__be16			tag_protocol;
+	int			priv_size;
+
+	/*
+	 * Probing and setup.
+	 */
+	char	*(*probe)(struct mii_bus *bus, int sw_addr);
+	int	(*setup)(struct dsa_switch *ds);
+	int	(*set_addr)(struct dsa_switch *ds, u8 *addr);
+
+	/*
+	 * Access to the switch's PHY registers.
+	 */
+	int	(*phy_read)(struct dsa_switch *ds, int port, int regnum);
+	int	(*phy_write)(struct dsa_switch *ds, int port,
+			     int regnum, u16 val);
+
+	/*
+	 * Link state polling and IRQ handling.
+	 */
+	void	(*poll_link)(struct dsa_switch *ds);
+
+	/*
+	 * ethtool hardware statistics.
+	 */
+	void	(*get_strings)(struct dsa_switch *ds, int port, uint8_t *data);
+	void	(*get_ethtool_stats)(struct dsa_switch *ds,
+				     int port, uint64_t *data);
+	int	(*get_sset_count)(struct dsa_switch *ds);
+};
+
+/* dsa.c */
+extern char dsa_driver_version[];
+void register_switch_driver(struct dsa_switch_driver *type);
+void unregister_switch_driver(struct dsa_switch_driver *type);
+
+/* slave.c */
+void dsa_slave_mii_bus_init(struct dsa_switch *ds);
+struct net_device *dsa_slave_create(struct dsa_switch *ds,
+				    struct device *parent,
+				    int port, char *name);
+
+/* tag_edsa.c */
+int edsa_xmit(struct sk_buff *skb, struct net_device *dev);
+
+
+#endif
diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c
new file mode 100644
index 00000000000..147818cc706
--- /dev/null
+++ b/net/dsa/mv88e6123_61_65.c
@@ -0,0 +1,417 @@
+/*
+ * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+#include "mv88e6xxx.h"
+
+static char *mv88e6123_61_65_probe(struct mii_bus *bus, int sw_addr)
+{
+	int ret;
+
+	ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
+	if (ret >= 0) {
+		ret &= 0xfff0;
+		if (ret == 0x1210)
+			return "Marvell 88E6123";
+		if (ret == 0x1610)
+			return "Marvell 88E6161";
+		if (ret == 0x1650)
+			return "Marvell 88E6165";
+	}
+
+	return NULL;
+}
+
+static int mv88e6123_61_65_switch_reset(struct dsa_switch *ds)
+{
+	int i;
+	int ret;
+
+	/*
+	 * Set all ports to the disabled state.
+	 */
+	for (i = 0; i < 8; i++) {
+		ret = REG_READ(REG_PORT(i), 0x04);
+		REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
+	}
+
+	/*
+	 * Wait for transmit queues to drain.
+	 */
+	msleep(2);
+
+	/*
+	 * Reset the switch.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0xc400);
+
+	/*
+	 * Wait up to one second for reset to complete.
+	 */
+	for (i = 0; i < 1000; i++) {
+		ret = REG_READ(REG_GLOBAL, 0x00);
+		if ((ret & 0xc800) == 0xc800)
+			break;
+
+		msleep(1);
+	}
+	if (i == 1000)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	/*
+	 * Disable the PHY polling unit (since there won't be any
+	 * external PHYs to poll), don't discard packets with
+	 * excessive collisions, and mask all interrupt sources.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0x0000);
+
+	/*
+	 * Set the default address aging time to 5 minutes, and
+	 * enable address learn messages to be sent to all message
+	 * ports.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x0a, 0x0148);
+
+	/*
+	 * Configure the priority mapping registers.
+	 */
+	ret = mv88e6xxx_config_prio(ds);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Configure the cpu port, and configure the cpu port as the
+	 * port to which ingress and egress monitor frames are to be
+	 * sent.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1110));
+
+	/*
+	 * Disable remote management for now, and set the switch's
+	 * DSA device number to zero.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1c, 0x0000);
+
+	/*
+	 * Send all frames with destination addresses matching
+	 * 01:80:c2:00:00:2x to the CPU port.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x02, 0xffff);
+
+	/*
+	 * Send all frames with destination addresses matching
+	 * 01:80:c2:00:00:0x to the CPU port.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x03, 0xffff);
+
+	/*
+	 * Disable the loopback filter, disable flow control
+	 * messages, disable flood broadcast override, disable
+	 * removing of provider tags, disable ATU age violation
+	 * interrupts, disable tag flow control, force flow
+	 * control priority to the highest, and send all special
+	 * multicast frames to the CPU at the highest priority.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
+
+	/*
+	 * Map all DSA device IDs to the CPU port.
+	 */
+	for (i = 0; i < 32; i++)
+		REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port);
+
+	/*
+	 * Clear all trunk masks.
+	 */
+	for (i = 0; i < 8; i++)
+		REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff);
+
+	/*
+	 * Clear all trunk mappings.
+	 */
+	for (i = 0; i < 16; i++)
+		REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11));
+
+	/*
+	 * Disable ingress rate limiting by resetting all ingress
+	 * rate limit registers to their initial state.
+	 */
+	for (i = 0; i < 6; i++)
+		REG_WRITE(REG_GLOBAL2, 0x09, 0x9000 | (i << 8));
+
+	/*
+	 * Initialise cross-chip port VLAN table to reset defaults.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x0b, 0x9000);
+
+	/*
+	 * Clear the priority override table.
+	 */
+	for (i = 0; i < 16; i++)
+		REG_WRITE(REG_GLOBAL2, 0x0f, 0x8000 | (i << 8));
+
+	/* @@@ initialise AVB (22/23) watchdog (27) sdet (29) registers */
+
+	return 0;
+}
+
+static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
+{
+	int addr = REG_PORT(p);
+
+	/*
+	 * MAC Forcing register: don't force link, speed, duplex
+	 * or flow control state to any particular values.
+	 */
+	REG_WRITE(addr, 0x01, 0x0003);
+
+	/*
+	 * Do not limit the period of time that this port can be
+	 * paused for by the remote end or the period of time that
+	 * this port can pause the remote end.
+	 */
+	REG_WRITE(addr, 0x02, 0x0000);
+
+	/*
+	 * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
+	 * configure the EDSA tagging mode if this is the CPU port,
+	 * disable Header mode, enable IGMP/MLD snooping, disable VLAN
+	 * tunneling, determine priority by looking at 802.1p and IP
+	 * priority fields (IP prio has precedence), and set STP state
+	 * to Forwarding.  Finally, if this is the CPU port, additionally
+	 * enable forwarding of unknown unicast and multicast addresses.
+	 */
+	REG_WRITE(addr, 0x04,
+			(p == ds->cpu_port) ? 0x373f : 0x0433);
+
+	/*
+	 * Port Control 1: disable trunking.  Also, if this is the
+	 * CPU port, enable learn messages to be sent to this port.
+	 */
+	REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000);
+
+	/*
+	 * Port based VLAN map: give each port its own address
+	 * database, allow the CPU port to talk to each of the 'real'
+	 * ports, and allow each of the 'real' ports to only talk to
+	 * the CPU port.
+	 */
+	REG_WRITE(addr, 0x06,
+			((p & 0xf) << 12) |
+			 ((p == ds->cpu_port) ?
+				ds->valid_port_mask :
+				(1 << ds->cpu_port)));
+
+	/*
+	 * Default VLAN ID and priority: don't set a default VLAN
+	 * ID, and set the default packet priority to zero.
+	 */
+	REG_WRITE(addr, 0x07, 0x0000);
+
+	/*
+	 * Port Control 2: don't force a good FCS, set the maximum
+	 * frame size to 10240 bytes, don't let the switch add or
+	 * strip 802.1q tags, don't discard tagged or untagged frames
+	 * on this port, do a destination address lookup on all
+	 * received packets as usual, disable ARP mirroring and don't
+	 * send a copy of all transmitted/received frames on this port
+	 * to the CPU.
+	 */
+	REG_WRITE(addr, 0x08, 0x2080);
+
+	/*
+	 * Egress rate control: disable egress rate control.
+	 */
+	REG_WRITE(addr, 0x09, 0x0001);
+
+	/*
+	 * Egress rate control 2: disable egress rate control.
+	 */
+	REG_WRITE(addr, 0x0a, 0x0000);
+
+	/*
+	 * Port Association Vector: when learning source addresses
+	 * of packets, add the address to the address database using
+	 * a port bitmap that has only the bit for this port set and
+	 * the other bits clear.
+	 */
+	REG_WRITE(addr, 0x0b, 1 << p);
+
+	/*
+	 * Port ATU control: disable limiting the number of address
+	 * database entries that this port is allowed to use.
+	 */
+	REG_WRITE(addr, 0x0c, 0x0000);
+
+	/*
+	 * Priorit Override: disable DA, SA and VTU priority override.
+	 */
+	REG_WRITE(addr, 0x0d, 0x0000);
+
+	/*
+	 * Port Ethertype: use the Ethertype DSA Ethertype value.
+	 */
+	REG_WRITE(addr, 0x0f, ETH_P_EDSA);
+
+	/*
+	 * Tag Remap: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x18, 0x3210);
+
+	/*
+	 * Tag Remap 2: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x19, 0x7654);
+
+	return 0;
+}
+
+static int mv88e6123_61_65_setup(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int i;
+	int ret;
+
+	mutex_init(&ps->smi_mutex);
+	mutex_init(&ps->stats_mutex);
+
+	ret = mv88e6123_61_65_switch_reset(ds);
+	if (ret < 0)
+		return ret;
+
+	/* @@@ initialise vtu and atu */
+
+	ret = mv88e6123_61_65_setup_global(ds);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < 6; i++) {
+		ret = mv88e6123_61_65_setup_port(ds, i);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mv88e6123_61_65_port_to_phy_addr(int port)
+{
+	if (port >= 0 && port <= 4)
+		return port;
+	return -1;
+}
+
+static int
+mv88e6123_61_65_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+	int addr = mv88e6123_61_65_port_to_phy_addr(port);
+	return mv88e6xxx_phy_read(ds, addr, regnum);
+}
+
+static int
+mv88e6123_61_65_phy_write(struct dsa_switch *ds,
+			      int port, int regnum, u16 val)
+{
+	int addr = mv88e6123_61_65_port_to_phy_addr(port);
+	return mv88e6xxx_phy_write(ds, addr, regnum, val);
+}
+
+static struct mv88e6xxx_hw_stat mv88e6123_61_65_hw_stats[] = {
+	{ "in_good_octets", 8, 0x00, },
+	{ "in_bad_octets", 4, 0x02, },
+	{ "in_unicast", 4, 0x04, },
+	{ "in_broadcasts", 4, 0x06, },
+	{ "in_multicasts", 4, 0x07, },
+	{ "in_pause", 4, 0x16, },
+	{ "in_undersize", 4, 0x18, },
+	{ "in_fragments", 4, 0x19, },
+	{ "in_oversize", 4, 0x1a, },
+	{ "in_jabber", 4, 0x1b, },
+	{ "in_rx_error", 4, 0x1c, },
+	{ "in_fcs_error", 4, 0x1d, },
+	{ "out_octets", 8, 0x0e, },
+	{ "out_unicast", 4, 0x10, },
+	{ "out_broadcasts", 4, 0x13, },
+	{ "out_multicasts", 4, 0x12, },
+	{ "out_pause", 4, 0x15, },
+	{ "excessive", 4, 0x11, },
+	{ "collisions", 4, 0x1e, },
+	{ "deferred", 4, 0x05, },
+	{ "single", 4, 0x14, },
+	{ "multiple", 4, 0x17, },
+	{ "out_fcs_error", 4, 0x03, },
+	{ "late", 4, 0x1f, },
+	{ "hist_64bytes", 4, 0x08, },
+	{ "hist_65_127bytes", 4, 0x09, },
+	{ "hist_128_255bytes", 4, 0x0a, },
+	{ "hist_256_511bytes", 4, 0x0b, },
+	{ "hist_512_1023bytes", 4, 0x0c, },
+	{ "hist_1024_max_bytes", 4, 0x0d, },
+};
+
+static void
+mv88e6123_61_65_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+{
+	mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats),
+			      mv88e6123_61_65_hw_stats, port, data);
+}
+
+static void
+mv88e6123_61_65_get_ethtool_stats(struct dsa_switch *ds,
+				  int port, uint64_t *data)
+{
+	mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats),
+				    mv88e6123_61_65_hw_stats, port, data);
+}
+
+static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds)
+{
+	return ARRAY_SIZE(mv88e6123_61_65_hw_stats);
+}
+
+static struct dsa_switch_driver mv88e6123_61_65_switch_driver = {
+	.tag_protocol		= __constant_htons(ETH_P_EDSA),
+	.priv_size		= sizeof(struct mv88e6xxx_priv_state),
+	.probe			= mv88e6123_61_65_probe,
+	.setup			= mv88e6123_61_65_setup,
+	.set_addr		= mv88e6xxx_set_addr_indirect,
+	.phy_read		= mv88e6123_61_65_phy_read,
+	.phy_write		= mv88e6123_61_65_phy_write,
+	.poll_link		= mv88e6xxx_poll_link,
+	.get_strings		= mv88e6123_61_65_get_strings,
+	.get_ethtool_stats	= mv88e6123_61_65_get_ethtool_stats,
+	.get_sset_count		= mv88e6123_61_65_get_sset_count,
+};
+
+int __init mv88e6123_61_65_init(void)
+{
+	register_switch_driver(&mv88e6123_61_65_switch_driver);
+	return 0;
+}
+module_init(mv88e6123_61_65_init);
+
+void __exit mv88e6123_61_65_cleanup(void)
+{
+	unregister_switch_driver(&mv88e6123_61_65_switch_driver);
+}
+module_exit(mv88e6123_61_65_cleanup);
diff --git a/net/dsa/mv88e6xxx.c b/net/dsa/mv88e6xxx.c
new file mode 100644
index 00000000000..13d2328a240
--- /dev/null
+++ b/net/dsa/mv88e6xxx.c
@@ -0,0 +1,377 @@
+/*
+ * net/dsa/mv88e6xxx.c - Marvell 88e6xxx switch chip support
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+#include "mv88e6xxx.h"
+
+/*
+ * If the switch's ADDR[4:0] strap pins are strapped to zero, it will
+ * use all 32 SMI bus addresses on its SMI bus, and all switch registers
+ * will be directly accessible on some {device address,register address}
+ * pair.  If the ADDR[4:0] pins are not strapped to zero, the switch
+ * will only respond to SMI transactions to that specific address, and
+ * an indirect addressing mechanism needs to be used to access its
+ * registers.
+ */
+static int mv88e6xxx_reg_wait_ready(struct mii_bus *bus, int sw_addr)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		ret = mdiobus_read(bus, sw_addr, 0);
+		if (ret < 0)
+			return ret;
+
+		if ((ret & 0x8000) == 0)
+			return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg)
+{
+	int ret;
+
+	if (sw_addr == 0)
+		return mdiobus_read(bus, addr, reg);
+
+	/*
+	 * Wait for the bus to become free.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Transmit the read command.
+	 */
+	ret = mdiobus_write(bus, sw_addr, 0, 0x9800 | (addr << 5) | reg);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Wait for the read command to complete.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Read the data.
+	 */
+	ret = mdiobus_read(bus, sw_addr, 1);
+	if (ret < 0)
+		return ret;
+
+	return ret & 0xffff;
+}
+
+int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+
+	mutex_lock(&ps->smi_mutex);
+	ret = __mv88e6xxx_reg_read(ds->master_mii_bus,
+				   ds->pd->sw_addr, addr, reg);
+	mutex_unlock(&ps->smi_mutex);
+
+	return ret;
+}
+
+int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
+			  int reg, u16 val)
+{
+	int ret;
+
+	if (sw_addr == 0)
+		return mdiobus_write(bus, addr, reg, val);
+
+	/*
+	 * Wait for the bus to become free.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Transmit the data to write.
+	 */
+	ret = mdiobus_write(bus, sw_addr, 1, val);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Transmit the write command.
+	 */
+	ret = mdiobus_write(bus, sw_addr, 0, 0x9400 | (addr << 5) | reg);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Wait for the write command to complete.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+
+	mutex_lock(&ps->smi_mutex);
+	ret = __mv88e6xxx_reg_write(ds->master_mii_bus,
+				    ds->pd->sw_addr, addr, reg, val);
+	mutex_unlock(&ps->smi_mutex);
+
+	return ret;
+}
+
+int mv88e6xxx_config_prio(struct dsa_switch *ds)
+{
+	/*
+	 * Configure the IP ToS mapping registers.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x10, 0x0000);
+	REG_WRITE(REG_GLOBAL, 0x11, 0x0000);
+	REG_WRITE(REG_GLOBAL, 0x12, 0x5555);
+	REG_WRITE(REG_GLOBAL, 0x13, 0x5555);
+	REG_WRITE(REG_GLOBAL, 0x14, 0xaaaa);
+	REG_WRITE(REG_GLOBAL, 0x15, 0xaaaa);
+	REG_WRITE(REG_GLOBAL, 0x16, 0xffff);
+	REG_WRITE(REG_GLOBAL, 0x17, 0xffff);
+
+	/*
+	 * Configure the IEEE 802.1p priority mapping register.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x18, 0xfa41);
+
+	return 0;
+}
+
+int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < 6; i++) {
+		int j;
+
+		/*
+		 * Write the MAC address byte.
+		 */
+		REG_WRITE(REG_GLOBAL2, 0x0d, 0x8000 | (i << 8) | addr[i]);
+
+		/*
+		 * Wait for the write to complete.
+		 */
+		for (j = 0; j < 16; j++) {
+			ret = REG_READ(REG_GLOBAL2, 0x0d);
+			if ((ret & 0x8000) == 0)
+				break;
+		}
+		if (j == 16)
+			return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum)
+{
+	if (addr >= 0)
+		return mv88e6xxx_reg_read(ds, addr, regnum);
+	return 0xffff;
+}
+
+int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val)
+{
+	if (addr >= 0)
+		return mv88e6xxx_reg_write(ds, addr, regnum, val);
+	return 0;
+}
+
+void mv88e6xxx_poll_link(struct dsa_switch *ds)
+{
+	int i;
+
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		struct net_device *dev;
+		int port_status;
+		int link;
+		int speed;
+		int duplex;
+		int fc;
+
+		dev = ds->ports[i];
+		if (dev == NULL)
+			continue;
+
+		link = 0;
+		if (dev->flags & IFF_UP) {
+			port_status = mv88e6xxx_reg_read(ds, REG_PORT(i), 0x00);
+			if (port_status < 0)
+				continue;
+
+			link = !!(port_status & 0x0800);
+		}
+
+		if (!link) {
+			if (netif_carrier_ok(dev)) {
+				printk(KERN_INFO "%s: link down\n", dev->name);
+				netif_carrier_off(dev);
+			}
+			continue;
+		}
+
+		switch (port_status & 0x0300) {
+		case 0x0000:
+			speed = 10;
+			break;
+		case 0x0100:
+			speed = 100;
+			break;
+		case 0x0200:
+			speed = 1000;
+			break;
+		default:
+			speed = -1;
+			break;
+		}
+		duplex = (port_status & 0x0400) ? 1 : 0;
+		fc = (port_status & 0x8000) ? 1 : 0;
+
+		if (!netif_carrier_ok(dev)) {
+			printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, "
+					 "flow control %sabled\n", dev->name,
+					 speed, duplex ? "full" : "half",
+					 fc ? "en" : "dis");
+			netif_carrier_on(dev);
+		}
+	}
+}
+
+static int mv88e6xxx_stats_wait(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		ret = REG_READ(REG_GLOBAL2, 0x1d);
+		if ((ret & 0x8000) == 0)
+			return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mv88e6xxx_stats_snapshot(struct dsa_switch *ds, int port)
+{
+	int ret;
+
+	/*
+	 * Snapshot the hardware statistics counters for this port.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1d, 0xdc00 | port);
+
+	/*
+	 * Wait for the snapshotting to complete.
+	 */
+	ret = mv88e6xxx_stats_wait(ds);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static void mv88e6xxx_stats_read(struct dsa_switch *ds, int stat, u32 *val)
+{
+	u32 _val;
+	int ret;
+
+	*val = 0;
+
+	ret = mv88e6xxx_reg_write(ds, REG_GLOBAL, 0x1d, 0xcc00 | stat);
+	if (ret < 0)
+		return;
+
+	ret = mv88e6xxx_stats_wait(ds);
+	if (ret < 0)
+		return;
+
+	ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1e);
+	if (ret < 0)
+		return;
+
+	_val = ret << 16;
+
+	ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1f);
+	if (ret < 0)
+		return;
+
+	*val = _val | ret;
+}
+
+void mv88e6xxx_get_strings(struct dsa_switch *ds,
+			   int nr_stats, struct mv88e6xxx_hw_stat *stats,
+			   int port, uint8_t *data)
+{
+	int i;
+
+	for (i = 0; i < nr_stats; i++) {
+		memcpy(data + i * ETH_GSTRING_LEN,
+		       stats[i].string, ETH_GSTRING_LEN);
+	}
+}
+
+void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds,
+				 int nr_stats, struct mv88e6xxx_hw_stat *stats,
+				 int port, uint64_t *data)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+	int i;
+
+	mutex_lock(&ps->stats_mutex);
+
+	ret = mv88e6xxx_stats_snapshot(ds, port);
+	if (ret < 0) {
+		mutex_unlock(&ps->stats_mutex);
+		return;
+	}
+
+	/*
+	 * Read each of the counters.
+	 */
+	for (i = 0; i < nr_stats; i++) {
+		struct mv88e6xxx_hw_stat *s = stats + i;
+		u32 low;
+		u32 high;
+
+		mv88e6xxx_stats_read(ds, s->reg, &low);
+		if (s->sizeof_stat == 8)
+			mv88e6xxx_stats_read(ds, s->reg + 1, &high);
+		else
+			high = 0;
+
+		data[i] = (((u64)high) << 32) | low;
+	}
+
+	mutex_unlock(&ps->stats_mutex);
+}
diff --git a/net/dsa/mv88e6xxx.h b/net/dsa/mv88e6xxx.h
new file mode 100644
index 00000000000..a004d4d0208
--- /dev/null
+++ b/net/dsa/mv88e6xxx.h
@@ -0,0 +1,77 @@
+/*
+ * net/dsa/mv88e6xxx.h - Marvell 88e6xxx switch chip support
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __MV88E6XXX_H
+#define __MV88E6XXX_H
+
+#define REG_PORT(p)		(0x10 + (p))
+#define REG_GLOBAL		0x1b
+#define REG_GLOBAL2		0x1c
+
+struct mv88e6xxx_priv_state {
+	/*
+	 * When using multi-chip addressing, this mutex protects
+	 * access to the indirect access registers.  (In single-chip
+	 * mode, this mutex is effectively useless.)
+	 */
+	struct mutex	smi_mutex;
+
+	/*
+	 * This mutex serialises access to the statistics unit.
+	 * Hold this mutex over snapshot + dump sequences.
+	 */
+	struct mutex	stats_mutex;
+};
+
+struct mv88e6xxx_hw_stat {
+	char string[ETH_GSTRING_LEN];
+	int sizeof_stat;
+	int reg;
+};
+
+int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg);
+int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg);
+int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
+                          int reg, u16 val);
+int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val);
+int mv88e6xxx_config_prio(struct dsa_switch *ds);
+int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr);
+int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum);
+int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val);
+void mv88e6xxx_poll_link(struct dsa_switch *ds);
+void mv88e6xxx_get_strings(struct dsa_switch *ds,
+			   int nr_stats, struct mv88e6xxx_hw_stat *stats,
+			   int port, uint8_t *data);
+void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds,
+				 int nr_stats, struct mv88e6xxx_hw_stat *stats,
+				 int port, uint64_t *data);
+
+#define REG_READ(addr, reg)						\
+	({								\
+		int __ret;						\
+									\
+		__ret = mv88e6xxx_reg_read(ds, addr, reg);		\
+		if (__ret < 0)						\
+			return __ret;					\
+		__ret;							\
+	})
+
+#define REG_WRITE(addr, reg, val)					\
+	({								\
+		int __ret;						\
+									\
+		__ret = mv88e6xxx_reg_write(ds, addr, reg, val);	\
+		if (__ret < 0)						\
+			return __ret;					\
+	})
+
+
+
+#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
new file mode 100644
index 00000000000..3cb331e98b8
--- /dev/null
+++ b/net/dsa/slave.c
@@ -0,0 +1,288 @@
+/*
+ * net/dsa/slave.c - Slave device handling
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+
+/* slave mii_bus handling ***************************************************/
+static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->valid_port_mask & (1 << addr))
+		return ds->drv->phy_read(ds, addr, reg);
+
+	return 0xffff;
+}
+
+static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->valid_port_mask & (1 << addr))
+		return ds->drv->phy_write(ds, addr, reg, val);
+
+	return 0;
+}
+
+void dsa_slave_mii_bus_init(struct dsa_switch *ds)
+{
+	ds->slave_mii_bus->priv = (void *)ds;
+	ds->slave_mii_bus->name = "dsa slave smi";
+	ds->slave_mii_bus->read = dsa_slave_phy_read;
+	ds->slave_mii_bus->write = dsa_slave_phy_write;
+	snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x",
+			ds->master_mii_bus->id, ds->pd->sw_addr);
+	ds->slave_mii_bus->parent = &(ds->master_mii_bus->dev);
+}
+
+
+/* slave device handling ****************************************************/
+static int dsa_slave_open(struct net_device *dev)
+{
+	return 0;
+}
+
+static int dsa_slave_close(struct net_device *dev)
+{
+	return 0;
+}
+
+static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->master_netdev;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1);
+}
+
+static void dsa_slave_set_rx_mode(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->master_netdev;
+
+	dev_mc_sync(master, dev);
+	dev_unicast_sync(master, dev);
+}
+
+static int dsa_slave_set_mac_address(struct net_device *dev, void *addr)
+{
+	memcpy(dev->dev_addr, addr + 2, 6);
+
+	return 0;
+}
+
+static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct mii_ioctl_data *mii_data = if_mii(ifr);
+
+	if (p->phy != NULL)
+		return phy_mii_ioctl(p->phy, mii_data, cmd);
+
+	return -EOPNOTSUPP;
+}
+
+
+/* ethtool operations *******************************************************/
+static int
+dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	int err;
+
+	err = -EOPNOTSUPP;
+	if (p->phy != NULL) {
+		err = phy_read_status(p->phy);
+		if (err == 0)
+			err = phy_ethtool_gset(p->phy, cmd);
+	}
+
+	return err;
+}
+
+static int
+dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL)
+		return phy_ethtool_sset(p->phy, cmd);
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_slave_get_drvinfo(struct net_device *dev,
+				  struct ethtool_drvinfo *drvinfo)
+{
+	strncpy(drvinfo->driver, "dsa", 32);
+	strncpy(drvinfo->version, dsa_driver_version, 32);
+	strncpy(drvinfo->fw_version, "N/A", 32);
+	strncpy(drvinfo->bus_info, "platform", 32);
+}
+
+static int dsa_slave_nway_reset(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL)
+		return genphy_restart_aneg(p->phy);
+
+	return -EOPNOTSUPP;
+}
+
+static u32 dsa_slave_get_link(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL) {
+		genphy_update_link(p->phy);
+		return p->phy->link;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_slave_get_strings(struct net_device *dev,
+				  uint32_t stringset, uint8_t *data)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	if (stringset == ETH_SS_STATS) {
+		int len = ETH_GSTRING_LEN;
+
+		strncpy(data, "tx_packets", len);
+		strncpy(data + len, "tx_bytes", len);
+		strncpy(data + 2 * len, "rx_packets", len);
+		strncpy(data + 3 * len, "rx_bytes", len);
+		if (ds->drv->get_strings != NULL)
+			ds->drv->get_strings(ds, p->port, data + 4 * len);
+	}
+}
+
+static void dsa_slave_get_ethtool_stats(struct net_device *dev,
+					struct ethtool_stats *stats,
+					uint64_t *data)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	data[0] = p->dev->stats.tx_packets;
+	data[1] = p->dev->stats.tx_bytes;
+	data[2] = p->dev->stats.rx_packets;
+	data[3] = p->dev->stats.rx_bytes;
+	if (ds->drv->get_ethtool_stats != NULL)
+		ds->drv->get_ethtool_stats(ds, p->port, data + 4);
+}
+
+static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	if (sset == ETH_SS_STATS) {
+		int count;
+
+		count = 4;
+		if (ds->drv->get_sset_count != NULL)
+			count += ds->drv->get_sset_count(ds);
+
+		return count;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct ethtool_ops dsa_slave_ethtool_ops = {
+	.get_settings		= dsa_slave_get_settings,
+	.set_settings		= dsa_slave_set_settings,
+	.get_drvinfo		= dsa_slave_get_drvinfo,
+	.nway_reset		= dsa_slave_nway_reset,
+	.get_link		= dsa_slave_get_link,
+	.set_sg			= ethtool_op_set_sg,
+	.get_strings		= dsa_slave_get_strings,
+	.get_ethtool_stats	= dsa_slave_get_ethtool_stats,
+	.get_sset_count		= dsa_slave_get_sset_count,
+};
+
+
+/* slave device setup *******************************************************/
+struct net_device *
+dsa_slave_create(struct dsa_switch *ds, struct device *parent,
+		 int port, char *name)
+{
+	struct net_device *master = ds->master_netdev;
+	struct net_device *slave_dev;
+	struct dsa_slave_priv *p;
+	int ret;
+
+	slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv),
+				 name, ether_setup);
+	if (slave_dev == NULL)
+		return slave_dev;
+
+	slave_dev->features = master->vlan_features;
+	SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops);
+	memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
+	slave_dev->tx_queue_len = 0;
+	switch (ds->tag_protocol) {
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+	case htons(ETH_P_EDSA):
+		slave_dev->hard_start_xmit = edsa_xmit;
+		break;
+#endif
+	default:
+		BUG();
+	}
+	slave_dev->open = dsa_slave_open;
+	slave_dev->stop = dsa_slave_close;
+	slave_dev->change_rx_flags = dsa_slave_change_rx_flags;
+	slave_dev->set_rx_mode = dsa_slave_set_rx_mode;
+	slave_dev->set_multicast_list = dsa_slave_set_rx_mode;
+	slave_dev->set_mac_address = dsa_slave_set_mac_address;
+	slave_dev->do_ioctl = dsa_slave_ioctl;
+	SET_NETDEV_DEV(slave_dev, parent);
+	slave_dev->vlan_features = master->vlan_features;
+
+	p = netdev_priv(slave_dev);
+	p->dev = slave_dev;
+	p->parent = ds;
+	p->port = port;
+	p->phy = ds->slave_mii_bus->phy_map[port];
+
+	ret = register_netdev(slave_dev);
+	if (ret) {
+		printk(KERN_ERR "%s: error %d registering interface %s\n",
+				master->name, ret, slave_dev->name);
+		free_netdev(slave_dev);
+		return NULL;
+	}
+
+	netif_carrier_off(slave_dev);
+
+	if (p->phy != NULL) {
+		phy_attach(slave_dev, p->phy->dev.bus_id,
+			   0, PHY_INTERFACE_MODE_GMII);
+
+		p->phy->autoneg = AUTONEG_ENABLE;
+		p->phy->speed = 0;
+		p->phy->duplex = 0;
+		p->phy->advertising = p->phy->supported | ADVERTISED_Autoneg;
+		phy_start_aneg(p->phy);
+	}
+
+	return slave_dev;
+}
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
new file mode 100644
index 00000000000..f985ea99384
--- /dev/null
+++ b/net/dsa/tag_edsa.c
@@ -0,0 +1,213 @@
+/*
+ * net/dsa/tag_edsa.c - Ethertype DSA tagging
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include "dsa_priv.h"
+
+#define DSA_HLEN	4
+#define EDSA_HLEN	8
+
+int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	u8 *edsa_header;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/*
+	 * Convert the outermost 802.1q tag to a DSA tag and prepend
+	 * a DSA ethertype field is the packet is tagged, or insert
+	 * a DSA ethertype plus DSA tag between the addresses and the
+	 * current ethertype field if the packet is untagged.
+	 */
+	if (skb->protocol == htons(ETH_P_8021Q)) {
+		if (skb_cow_head(skb, DSA_HLEN) < 0)
+			goto out_free;
+		skb_push(skb, DSA_HLEN);
+
+		memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+
+		/*
+		 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
+		 */
+		edsa_header = skb->data + 2 * ETH_ALEN;
+		edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+		edsa_header[1] = ETH_P_EDSA & 0xff;
+		edsa_header[2] = 0x00;
+		edsa_header[3] = 0x00;
+		edsa_header[4] = 0x60;
+		edsa_header[5] = p->port << 3;
+
+		/*
+		 * Move CFI field from byte 6 to byte 5.
+		 */
+		if (edsa_header[6] & 0x10) {
+			edsa_header[5] |= 0x01;
+			edsa_header[6] &= ~0x10;
+		}
+	} else {
+		if (skb_cow_head(skb, EDSA_HLEN) < 0)
+			goto out_free;
+		skb_push(skb, EDSA_HLEN);
+
+		memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN);
+
+		/*
+		 * Construct untagged FROM_CPU DSA tag.
+		 */
+		edsa_header = skb->data + 2 * ETH_ALEN;
+		edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+		edsa_header[1] = ETH_P_EDSA & 0xff;
+		edsa_header[2] = 0x00;
+		edsa_header[3] = 0x00;
+		edsa_header[4] = 0x40;
+		edsa_header[5] = p->port << 3;
+		edsa_header[6] = 0x00;
+		edsa_header[7] = 0x00;
+	}
+
+	skb->protocol = htons(ETH_P_EDSA);
+
+	skb->dev = p->parent->master_netdev;
+	dev_queue_xmit(skb);
+
+	return NETDEV_TX_OK;
+
+out_free:
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch *ds = dev->dsa_ptr;
+	u8 *edsa_header;
+	int source_port;
+
+	if (unlikely(ds == NULL))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+		goto out_drop;
+
+	/*
+	 * Skip the two null bytes after the ethertype.
+	 */
+	edsa_header = skb->data + 2;
+
+	/*
+	 * Check that frame type is either TO_CPU or FORWARD, and
+	 * that the source device is zero.
+	 */
+	if ((edsa_header[0] & 0xdf) != 0x00 && (edsa_header[0] & 0xdf) != 0xc0)
+		goto out_drop;
+
+	/*
+	 * Check that the source port is a registered DSA port.
+	 */
+	source_port = (edsa_header[1] >> 3) & 0x1f;
+	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+		goto out_drop;
+
+	/*
+	 * If the 'tagged' bit is set, convert the DSA tag to a 802.1q
+	 * tag and delete the ethertype part.  If the 'tagged' bit is
+	 * clear, delete the ethertype and the DSA tag parts.
+	 */
+	if (edsa_header[0] & 0x20) {
+		u8 new_header[4];
+
+		/*
+		 * Insert 802.1q ethertype and copy the VLAN-related
+		 * fields, but clear the bit that will hold CFI (since
+		 * DSA uses that bit location for another purpose).
+		 */
+		new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+		new_header[1] = ETH_P_8021Q & 0xff;
+		new_header[2] = edsa_header[2] & ~0x10;
+		new_header[3] = edsa_header[3];
+
+		/*
+		 * Move CFI bit from its place in the DSA header to
+		 * its 802.1q-designated place.
+		 */
+		if (edsa_header[1] & 0x01)
+			new_header[2] |= 0x10;
+
+		skb_pull_rcsum(skb, DSA_HLEN);
+
+		/*
+		 * Update packet checksum if skb is CHECKSUM_COMPLETE.
+		 */
+		if (skb->ip_summed == CHECKSUM_COMPLETE) {
+			__wsum c = skb->csum;
+			c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+			c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0));
+			skb->csum = c;
+		}
+
+		memcpy(edsa_header, new_header, DSA_HLEN);
+
+		memmove(skb->data - ETH_HLEN,
+			skb->data - ETH_HLEN - DSA_HLEN,
+			2 * ETH_ALEN);
+	} else {
+		/*
+		 * Remove DSA tag and update checksum.
+		 */
+		skb_pull_rcsum(skb, EDSA_HLEN);
+		memmove(skb->data - ETH_HLEN,
+			skb->data - ETH_HLEN - EDSA_HLEN,
+			2 * ETH_ALEN);
+	}
+
+	skb->dev = ds->ports[source_port];
+	skb_push(skb, ETH_HLEN);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->last_rx = jiffies;
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+static struct packet_type edsa_packet_type = {
+	.type	= __constant_htons(ETH_P_EDSA),
+	.func	= edsa_rcv,
+};
+
+static int __init edsa_init_module(void)
+{
+	dev_add_pack(&edsa_packet_type);
+	return 0;
+}
+module_init(edsa_init_module);
+
+static void __exit edsa_cleanup_module(void)
+{
+	dev_remove_pack(&edsa_packet_type);
+}
+module_exit(edsa_cleanup_module);
-- 
cgit v1.2.3


From cf85d08fdf4548ee46657ccfb7f9949a85145db5 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Tue, 7 Oct 2008 13:45:02 +0000
Subject: dsa: add support for original DSA tagging format

Most of the DSA switches currently in the field do not support the
Ethertype DSA tagging format that one of the previous patches added
support for, but only the original DSA tagging format.

The original DSA tagging format carries the same information as the
Ethertype DSA tagging format, but with the difference that it does not
have an ethertype field.  In other words, when receiving a packet that
is tagged with an original DSA tag, there is no way of telling in
eth_type_trans() that this packet is in fact a DSA-tagged packet.

This patch adds a hook into eth_type_trans() which is only compiled in
if support for a switch chip that doesn't support Ethertype DSA is
selected, and which checks whether there is a DSA switch driver
instance attached to this network device which uses the old tag format.
If so, it sets the protocol field to ETH_P_DSA without looking at the
packet, so that the packet ends up in the right place.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Nicolas Pitre <nico@marvell.com>
Tested-by: Peter van Valderen <linux@ddcrew.com>
Tested-by: Dirk Teurlings <dirk@upexia.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig           |   4 +
 net/dsa/Makefile          |   1 +
 net/dsa/dsa.c             |  16 ++++
 net/dsa/dsa_priv.h        |   3 +
 net/dsa/mv88e6123_61_65.c |  18 +++--
 net/dsa/slave.c           |   5 ++
 net/dsa/tag_dsa.c         | 194 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ethernet/eth.c        |  10 +++
 8 files changed, 244 insertions(+), 7 deletions(-)
 create mode 100644 net/dsa/tag_dsa.c

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 7cf55e5eb39..6b68016827d 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -10,6 +10,10 @@ menuconfig NET_DSA
 if NET_DSA
 
 # tagging formats
+config NET_DSA_TAG_DSA
+	bool
+	default n
+
 config NET_DSA_TAG_EDSA
 	bool
 	default n
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index b59a6f6bcf5..8b92123315b 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,4 +1,5 @@
 # tagging formats
+obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
 obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
 
 # switch drivers
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 6cc5be2ec7f..f8c549281c3 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -202,6 +202,22 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 }
 
 
+/* hooks for ethertype-less tagging formats *********************************/
+/*
+ * The original DSA tag format and some other tag formats have no
+ * ethertype, which means that we need to add a little hack to the
+ * networking receive path to make sure that received frames get
+ * the right ->protocol assigned to them when one of those tag
+ * formats is in use.
+ */
+bool dsa_uses_dsa_tags(void *dsa_ptr)
+{
+	struct dsa_switch *ds = dsa_ptr;
+
+	return !!(ds->tag_protocol == htons(ETH_P_DSA));
+}
+
+
 /* link polling *************************************************************/
 static void dsa_link_poll_work(struct work_struct *ugly)
 {
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 21ee9052079..2f1d68c495e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -103,6 +103,9 @@ struct net_device *dsa_slave_create(struct dsa_switch *ds,
 				    struct device *parent,
 				    int port, char *name);
 
+/* tag_dsa.c */
+int dsa_xmit(struct sk_buff *skb, struct net_device *dev);
+
 /* tag_edsa.c */
 int edsa_xmit(struct sk_buff *skb, struct net_device *dev);
 
diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c
index 147818cc706..555b164082f 100644
--- a/net/dsa/mv88e6123_61_65.c
+++ b/net/dsa/mv88e6123_61_65.c
@@ -192,15 +192,19 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
 
 	/*
 	 * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
-	 * configure the EDSA tagging mode if this is the CPU port,
-	 * disable Header mode, enable IGMP/MLD snooping, disable VLAN
-	 * tunneling, determine priority by looking at 802.1p and IP
-	 * priority fields (IP prio has precedence), and set STP state
-	 * to Forwarding.  Finally, if this is the CPU port, additionally
-	 * enable forwarding of unknown unicast and multicast addresses.
+	 * configure the requested (DSA/EDSA) tagging mode if this is
+	 * the CPU port, disable Header mode, enable IGMP/MLD snooping,
+	 * disable VLAN tunneling, determine priority by looking at
+	 * 802.1p and IP priority fields (IP prio has precedence), and
+	 * set STP state to Forwarding.  Finally, if this is the CPU
+	 * port, additionally enable forwarding of unknown unicast and
+	 * multicast addresses.
 	 */
 	REG_WRITE(addr, 0x04,
-			(p == ds->cpu_port) ? 0x373f : 0x0433);
+			(p == ds->cpu_port) ?
+			 (ds->tag_protocol == htons(ETH_P_DSA)) ?
+			  0x053f : 0x373f :
+			 0x0433);
 
 	/*
 	 * Port Control 1: disable trunking.  Also, if this is the
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3cb331e98b8..8f8868dd430 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -239,6 +239,11 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
 	slave_dev->tx_queue_len = 0;
 	switch (ds->tag_protocol) {
+#ifdef CONFIG_NET_DSA_TAG_DSA
+	case htons(ETH_P_DSA):
+		slave_dev->hard_start_xmit = dsa_xmit;
+		break;
+#endif
 #ifdef CONFIG_NET_DSA_TAG_EDSA
 	case htons(ETH_P_EDSA):
 		slave_dev->hard_start_xmit = edsa_xmit;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
new file mode 100644
index 00000000000..bdc0510b53b
--- /dev/null
+++ b/net/dsa/tag_dsa.c
@@ -0,0 +1,194 @@
+/*
+ * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include "dsa_priv.h"
+
+#define DSA_HLEN	4
+
+int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	u8 *dsa_header;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/*
+	 * Convert the outermost 802.1q tag to a DSA tag for tagged
+	 * packets, or insert a DSA tag between the addresses and
+	 * the ethertype field for untagged packets.
+	 */
+	if (skb->protocol == htons(ETH_P_8021Q)) {
+		if (skb_cow_head(skb, 0) < 0)
+			goto out_free;
+
+		/*
+		 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
+		 */
+		dsa_header = skb->data + 2 * ETH_ALEN;
+		dsa_header[0] = 0x60;
+		dsa_header[1] = p->port << 3;
+
+		/*
+		 * Move CFI field from byte 2 to byte 1.
+		 */
+		if (dsa_header[2] & 0x10) {
+			dsa_header[1] |= 0x01;
+			dsa_header[2] &= ~0x10;
+		}
+	} else {
+		if (skb_cow_head(skb, DSA_HLEN) < 0)
+			goto out_free;
+		skb_push(skb, DSA_HLEN);
+
+		memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+
+		/*
+		 * Construct untagged FROM_CPU DSA tag.
+		 */
+		dsa_header = skb->data + 2 * ETH_ALEN;
+		dsa_header[0] = 0x40;
+		dsa_header[1] = p->port << 3;
+		dsa_header[2] = 0x00;
+		dsa_header[3] = 0x00;
+	}
+
+	skb->protocol = htons(ETH_P_DSA);
+
+	skb->dev = p->parent->master_netdev;
+	dev_queue_xmit(skb);
+
+	return NETDEV_TX_OK;
+
+out_free:
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch *ds = dev->dsa_ptr;
+	u8 *dsa_header;
+	int source_port;
+
+	if (unlikely(ds == NULL))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+		goto out_drop;
+
+	/*
+	 * The ethertype field is part of the DSA header.
+	 */
+	dsa_header = skb->data - 2;
+
+	/*
+	 * Check that frame type is either TO_CPU or FORWARD, and
+	 * that the source device is zero.
+	 */
+	if ((dsa_header[0] & 0xdf) != 0x00 && (dsa_header[0] & 0xdf) != 0xc0)
+		goto out_drop;
+
+	/*
+	 * Check that the source port is a registered DSA port.
+	 */
+	source_port = (dsa_header[1] >> 3) & 0x1f;
+	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+		goto out_drop;
+
+	/*
+	 * Convert the DSA header to an 802.1q header if the 'tagged'
+	 * bit in the DSA header is set.  If the 'tagged' bit is clear,
+	 * delete the DSA header entirely.
+	 */
+	if (dsa_header[0] & 0x20) {
+		u8 new_header[4];
+
+		/*
+		 * Insert 802.1q ethertype and copy the VLAN-related
+		 * fields, but clear the bit that will hold CFI (since
+		 * DSA uses that bit location for another purpose).
+		 */
+		new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+		new_header[1] = ETH_P_8021Q & 0xff;
+		new_header[2] = dsa_header[2] & ~0x10;
+		new_header[3] = dsa_header[3];
+
+		/*
+		 * Move CFI bit from its place in the DSA header to
+		 * its 802.1q-designated place.
+		 */
+		if (dsa_header[1] & 0x01)
+			new_header[2] |= 0x10;
+
+		/*
+		 * Update packet checksum if skb is CHECKSUM_COMPLETE.
+		 */
+		if (skb->ip_summed == CHECKSUM_COMPLETE) {
+			__wsum c = skb->csum;
+			c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+			c = csum_sub(c, csum_partial(dsa_header + 2, 2, 0));
+			skb->csum = c;
+		}
+
+		memcpy(dsa_header, new_header, DSA_HLEN);
+	} else {
+		/*
+		 * Remove DSA tag and update checksum.
+		 */
+		skb_pull_rcsum(skb, DSA_HLEN);
+		memmove(skb->data - ETH_HLEN,
+			skb->data - ETH_HLEN - DSA_HLEN,
+			2 * ETH_ALEN);
+	}
+
+	skb->dev = ds->ports[source_port];
+	skb_push(skb, ETH_HLEN);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->last_rx = jiffies;
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+static struct packet_type dsa_packet_type = {
+	.type	= __constant_htons(ETH_P_DSA),
+	.func	= dsa_rcv,
+};
+
+static int __init dsa_init_module(void)
+{
+	dev_add_pack(&dsa_packet_type);
+	return 0;
+}
+module_init(dsa_init_module);
+
+static void __exit dsa_cleanup_module(void)
+{
+	dev_remove_pack(&dsa_packet_type);
+}
+module_exit(dsa_cleanup_module);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 647a9edee37..dae47e7a44d 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -57,6 +57,7 @@
 #include <net/sock.h>
 #include <net/ipv6.h>
 #include <net/ip.h>
+#include <net/dsa.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
@@ -184,6 +185,15 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 			skb->pkt_type = PACKET_OTHERHOST;
 	}
 
+	/*
+	 * Some variants of DSA tagging don't have an ethertype field
+	 * at all, so we check here whether one of those tagging
+	 * variants has been configured on the receiving interface,
+	 * and if so, set skb->protocol without looking at the packet.
+	 */
+	if (netdev_uses_dsa_tags(dev))
+		return htons(ETH_P_DSA);
+
 	if (ntohs(eth->h_proto) >= 1536)
 		return eth->h_proto;
 
-- 
cgit v1.2.3


From 2e5f032095ff101274dfb03d5fd5e06d9aeb83cd Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Tue, 7 Oct 2008 13:45:18 +0000
Subject: dsa: add support for the Marvell 88E6131 switch chip

Add support for the Marvell 88E6131 switch chip.  This chip only
supports the original (ethertype-less) DSA tagging format.

On the 88E6131, there is a PHY Polling Unit (PPU) which has exclusive
access to each of the PHYs's MII management registers.  If we want to
talk to the PHYs from software, we have to disable the PPU and wait
for it to complete its current transaction before we can do so, and we
need to re-enable the PPU afterwards to make sure that the switch will
notice changes in link state and speed on the individual ports as they
occur.

Since disabling the PPU is rather slow, and since MII management
accesses are typically done in bursts, this patch keeps the PPU disabled
for 10ms after a software access completes.  This makes handling the
PPU slightly more complex, but speeds up something like running ethtool
on one of the switch slave interfaces from ~300ms to ~30ms on typical
hardware.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Nicolas Pitre <nico@marvell.com>
Tested-by: Peter van Valderen <linux@ddcrew.com>
Tested-by: Dirk Teurlings <dirk@upexia.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig     |  13 ++
 net/dsa/Makefile    |   1 +
 net/dsa/mv88e6131.c | 380 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dsa/mv88e6xxx.c | 145 ++++++++++++++++++++
 net/dsa/mv88e6xxx.h |  16 +++
 5 files changed, 555 insertions(+)
 create mode 100644 net/dsa/mv88e6131.c

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 6b68016827d..79bcd76d3f1 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -24,6 +24,19 @@ config NET_DSA_MV88E6XXX
 	bool
 	default n
 
+config NET_DSA_MV88E6XXX_NEED_PPU
+	bool
+	default n
+
+config NET_DSA_MV88E6131
+	bool "Marvell 88E6131 ethernet switch chip support"
+	select NET_DSA_MV88E6XXX
+	select NET_DSA_MV88E6XXX_NEED_PPU
+	select NET_DSA_TAG_DSA
+	---help---
+	  This enables support for the Marvell 88E6131 ethernet switch
+	  chip.
+
 config NET_DSA_MV88E6123_61_65
 	bool "Marvell 88E6123/6161/6165 ethernet switch chip support"
 	select NET_DSA_MV88E6XXX
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 8b92123315b..7fb6f85a69e 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
 # switch drivers
 obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
 obj-$(CONFIG_NET_DSA_MV88E6123_61_65) += mv88e6123_61_65.o
+obj-$(CONFIG_NET_DSA_MV88E6131) += mv88e6131.o
 
 # the core
 obj-$(CONFIG_NET_DSA) += dsa.o slave.o
diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c
new file mode 100644
index 00000000000..36e01eb863a
--- /dev/null
+++ b/net/dsa/mv88e6131.c
@@ -0,0 +1,380 @@
+/*
+ * net/dsa/mv88e6131.c - Marvell 88e6131 switch chip support
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+#include "mv88e6xxx.h"
+
+static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr)
+{
+	int ret;
+
+	ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
+	if (ret >= 0) {
+		ret &= 0xfff0;
+		if (ret == 0x1060)
+			return "Marvell 88E6131";
+	}
+
+	return NULL;
+}
+
+static int mv88e6131_switch_reset(struct dsa_switch *ds)
+{
+	int i;
+	int ret;
+
+	/*
+	 * Set all ports to the disabled state.
+	 */
+	for (i = 0; i < 8; i++) {
+		ret = REG_READ(REG_PORT(i), 0x04);
+		REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
+	}
+
+	/*
+	 * Wait for transmit queues to drain.
+	 */
+	msleep(2);
+
+	/*
+	 * Reset the switch.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0xc400);
+
+	/*
+	 * Wait up to one second for reset to complete.
+	 */
+	for (i = 0; i < 1000; i++) {
+		ret = REG_READ(REG_GLOBAL, 0x00);
+		if ((ret & 0xc800) == 0xc800)
+			break;
+
+		msleep(1);
+	}
+	if (i == 1000)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static int mv88e6131_setup_global(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	/*
+	 * Enable the PHY polling unit, don't discard packets with
+	 * excessive collisions, use a weighted fair queueing scheme
+	 * to arbitrate between packet queues, set the maximum frame
+	 * size to 1632, and mask all interrupt sources.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0x4400);
+
+	/*
+	 * Set the default address aging time to 5 minutes, and
+	 * enable address learn messages to be sent to all message
+	 * ports.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x0a, 0x0148);
+
+	/*
+	 * Configure the priority mapping registers.
+	 */
+	ret = mv88e6xxx_config_prio(ds);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Set the VLAN ethertype to 0x8100.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x19, 0x8100);
+
+	/*
+	 * Disable ARP mirroring, and configure the cpu port as the
+	 * port to which ingress and egress monitor frames are to be
+	 * sent.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1100) | 0x00f0);
+
+	/*
+	 * Disable cascade port functionality, and set the switch's
+	 * DSA device number to zero.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1c, 0xe000);
+
+	/*
+	 * Send all frames with destination addresses matching
+	 * 01:80:c2:00:00:0x to the CPU port.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x03, 0xffff);
+
+	/*
+	 * Ignore removed tag data on doubly tagged packets, disable
+	 * flow control messages, force flow control priority to the
+	 * highest, and send all special multicast frames to the CPU
+	 * port at the higest priority.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
+
+	/*
+	 * Map all DSA device IDs to the CPU port.
+	 */
+	for (i = 0; i < 32; i++)
+		REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port);
+
+	/*
+	 * Clear all trunk masks.
+	 */
+	for (i = 0; i < 8; i++)
+		REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff);
+
+	/*
+	 * Clear all trunk mappings.
+	 */
+	for (i = 0; i < 16; i++)
+		REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11));
+
+	/*
+	 * Force the priority of IGMP/MLD snoop frames and ARP frames
+	 * to the highest setting.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x0f, 0x00ff);
+
+	return 0;
+}
+
+static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
+{
+	int addr = REG_PORT(p);
+
+	/*
+	 * MAC Forcing register: don't force link, speed, duplex
+	 * or flow control state to any particular values.
+	 */
+	REG_WRITE(addr, 0x01, 0x0003);
+
+	/*
+	 * Port Control: disable Core Tag, disable Drop-on-Lock,
+	 * transmit frames unmodified, disable Header mode,
+	 * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN
+	 * tunneling, determine priority by looking at 802.1p and
+	 * IP priority fields (IP prio has precedence), and set STP
+	 * state to Forwarding.  Finally, if this is the CPU port,
+	 * additionally enable DSA tagging and forwarding of unknown
+	 * unicast addresses.
+	 */
+	REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x0537 : 0x0433);
+
+	/*
+	 * Port Control 1: disable trunking.  Also, if this is the
+	 * CPU port, enable learn messages to be sent to this port.
+	 */
+	REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000);
+
+	/*
+	 * Port based VLAN map: give each port its own address
+	 * database, allow the CPU port to talk to each of the 'real'
+	 * ports, and allow each of the 'real' ports to only talk to
+	 * the CPU port.
+	 */
+	REG_WRITE(addr, 0x06,
+			((p & 0xf) << 12) |
+			 ((p == ds->cpu_port) ?
+				ds->valid_port_mask :
+				(1 << ds->cpu_port)));
+
+	/*
+	 * Default VLAN ID and priority: don't set a default VLAN
+	 * ID, and set the default packet priority to zero.
+	 */
+	REG_WRITE(addr, 0x07, 0x0000);
+
+	/*
+	 * Port Control 2: don't force a good FCS, don't use
+	 * VLAN-based, source address-based or destination
+	 * address-based priority overrides, don't let the switch
+	 * add or strip 802.1q tags, don't discard tagged or
+	 * untagged frames on this port, do a destination address
+	 * lookup on received packets as usual, don't send a copy
+	 * of all transmitted/received frames on this port to the
+	 * CPU, and configure the CPU port number.  Also, if this
+	 * is the CPU port, enable forwarding of unknown multicast
+	 * addresses.
+	 */
+	REG_WRITE(addr, 0x08,
+			((p == ds->cpu_port) ? 0x00c0 : 0x0080) |
+			 ds->cpu_port);
+
+	/*
+	 * Rate Control: disable ingress rate limiting.
+	 */
+	REG_WRITE(addr, 0x09, 0x0000);
+
+	/*
+	 * Rate Control 2: disable egress rate limiting.
+	 */
+	REG_WRITE(addr, 0x0a, 0x0000);
+
+	/*
+	 * Port Association Vector: when learning source addresses
+	 * of packets, add the address to the address database using
+	 * a port bitmap that has only the bit for this port set and
+	 * the other bits clear.
+	 */
+	REG_WRITE(addr, 0x0b, 1 << p);
+
+	/*
+	 * Tag Remap: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x18, 0x3210);
+
+	/*
+	 * Tag Remap 2: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x19, 0x7654);
+
+	return 0;
+}
+
+static int mv88e6131_setup(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int i;
+	int ret;
+
+	mutex_init(&ps->smi_mutex);
+	mv88e6xxx_ppu_state_init(ds);
+	mutex_init(&ps->stats_mutex);
+
+	ret = mv88e6131_switch_reset(ds);
+	if (ret < 0)
+		return ret;
+
+	/* @@@ initialise vtu and atu */
+
+	ret = mv88e6131_setup_global(ds);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < 6; i++) {
+		ret = mv88e6131_setup_port(ds, i);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mv88e6131_port_to_phy_addr(int port)
+{
+	if (port >= 0 && port != 3 && port <= 7)
+		return port;
+	return -1;
+}
+
+static int
+mv88e6131_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+	int addr = mv88e6131_port_to_phy_addr(port);
+	return mv88e6xxx_phy_read_ppu(ds, addr, regnum);
+}
+
+static int
+mv88e6131_phy_write(struct dsa_switch *ds,
+			      int port, int regnum, u16 val)
+{
+	int addr = mv88e6131_port_to_phy_addr(port);
+	return mv88e6xxx_phy_write_ppu(ds, addr, regnum, val);
+}
+
+static struct mv88e6xxx_hw_stat mv88e6131_hw_stats[] = {
+	{ "in_good_octets", 8, 0x00, },
+	{ "in_bad_octets", 4, 0x02, },
+	{ "in_unicast", 4, 0x04, },
+	{ "in_broadcasts", 4, 0x06, },
+	{ "in_multicasts", 4, 0x07, },
+	{ "in_pause", 4, 0x16, },
+	{ "in_undersize", 4, 0x18, },
+	{ "in_fragments", 4, 0x19, },
+	{ "in_oversize", 4, 0x1a, },
+	{ "in_jabber", 4, 0x1b, },
+	{ "in_rx_error", 4, 0x1c, },
+	{ "in_fcs_error", 4, 0x1d, },
+	{ "out_octets", 8, 0x0e, },
+	{ "out_unicast", 4, 0x10, },
+	{ "out_broadcasts", 4, 0x13, },
+	{ "out_multicasts", 4, 0x12, },
+	{ "out_pause", 4, 0x15, },
+	{ "excessive", 4, 0x11, },
+	{ "collisions", 4, 0x1e, },
+	{ "deferred", 4, 0x05, },
+	{ "single", 4, 0x14, },
+	{ "multiple", 4, 0x17, },
+	{ "out_fcs_error", 4, 0x03, },
+	{ "late", 4, 0x1f, },
+	{ "hist_64bytes", 4, 0x08, },
+	{ "hist_65_127bytes", 4, 0x09, },
+	{ "hist_128_255bytes", 4, 0x0a, },
+	{ "hist_256_511bytes", 4, 0x0b, },
+	{ "hist_512_1023bytes", 4, 0x0c, },
+	{ "hist_1024_max_bytes", 4, 0x0d, },
+};
+
+static void
+mv88e6131_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+{
+	mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6131_hw_stats),
+			      mv88e6131_hw_stats, port, data);
+}
+
+static void
+mv88e6131_get_ethtool_stats(struct dsa_switch *ds,
+				  int port, uint64_t *data)
+{
+	mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6131_hw_stats),
+				    mv88e6131_hw_stats, port, data);
+}
+
+static int mv88e6131_get_sset_count(struct dsa_switch *ds)
+{
+	return ARRAY_SIZE(mv88e6131_hw_stats);
+}
+
+static struct dsa_switch_driver mv88e6131_switch_driver = {
+	.tag_protocol		= __constant_htons(ETH_P_DSA),
+	.priv_size		= sizeof(struct mv88e6xxx_priv_state),
+	.probe			= mv88e6131_probe,
+	.setup			= mv88e6131_setup,
+	.set_addr		= mv88e6xxx_set_addr_direct,
+	.phy_read		= mv88e6131_phy_read,
+	.phy_write		= mv88e6131_phy_write,
+	.poll_link		= mv88e6xxx_poll_link,
+	.get_strings		= mv88e6131_get_strings,
+	.get_ethtool_stats	= mv88e6131_get_ethtool_stats,
+	.get_sset_count		= mv88e6131_get_sset_count,
+};
+
+int __init mv88e6131_init(void)
+{
+	register_switch_driver(&mv88e6131_switch_driver);
+	return 0;
+}
+module_init(mv88e6131_init);
+
+void __exit mv88e6131_cleanup(void)
+{
+	unregister_switch_driver(&mv88e6131_switch_driver);
+}
+module_exit(mv88e6131_cleanup);
diff --git a/net/dsa/mv88e6xxx.c b/net/dsa/mv88e6xxx.c
index 13d2328a240..aa6c609c59f 100644
--- a/net/dsa/mv88e6xxx.c
+++ b/net/dsa/mv88e6xxx.c
@@ -165,6 +165,15 @@ int mv88e6xxx_config_prio(struct dsa_switch *ds)
 	return 0;
 }
 
+int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr)
+{
+	REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]);
+	REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]);
+	REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]);
+
+	return 0;
+}
+
 int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr)
 {
 	int i;
@@ -207,6 +216,142 @@ int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val)
 	return 0;
 }
 
+#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU
+static int mv88e6xxx_ppu_disable(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	ret = REG_READ(REG_GLOBAL, 0x04);
+	REG_WRITE(REG_GLOBAL, 0x04, ret & ~0x4000);
+
+	for (i = 0; i < 1000; i++) {
+	        ret = REG_READ(REG_GLOBAL, 0x00);
+	        msleep(1);
+	        if ((ret & 0xc000) != 0xc000)
+	                return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mv88e6xxx_ppu_enable(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	ret = REG_READ(REG_GLOBAL, 0x04);
+	REG_WRITE(REG_GLOBAL, 0x04, ret | 0x4000);
+
+	for (i = 0; i < 1000; i++) {
+	        ret = REG_READ(REG_GLOBAL, 0x00);
+	        msleep(1);
+	        if ((ret & 0xc000) == 0xc000)
+	                return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+static void mv88e6xxx_ppu_reenable_work(struct work_struct *ugly)
+{
+	struct mv88e6xxx_priv_state *ps;
+
+	ps = container_of(ugly, struct mv88e6xxx_priv_state, ppu_work);
+	if (mutex_trylock(&ps->ppu_mutex)) {
+	        struct dsa_switch *ds = ((struct dsa_switch *)ps) - 1;
+
+	        if (mv88e6xxx_ppu_enable(ds) == 0)
+	                ps->ppu_disabled = 0;
+	        mutex_unlock(&ps->ppu_mutex);
+	}
+}
+
+static void mv88e6xxx_ppu_reenable_timer(unsigned long _ps)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)_ps;
+
+	schedule_work(&ps->ppu_work);
+}
+
+static int mv88e6xxx_ppu_access_get(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+
+	mutex_lock(&ps->ppu_mutex);
+
+	/*
+	 * If the PHY polling unit is enabled, disable it so that
+	 * we can access the PHY registers.  If it was already
+	 * disabled, cancel the timer that is going to re-enable
+	 * it.
+	 */
+	if (!ps->ppu_disabled) {
+	        ret = mv88e6xxx_ppu_disable(ds);
+	        if (ret < 0) {
+	                mutex_unlock(&ps->ppu_mutex);
+	                return ret;
+	        }
+	        ps->ppu_disabled = 1;
+	} else {
+	        del_timer(&ps->ppu_timer);
+	        ret = 0;
+	}
+
+	return ret;
+}
+
+static void mv88e6xxx_ppu_access_put(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+
+	/*
+	 * Schedule a timer to re-enable the PHY polling unit.
+	 */
+	mod_timer(&ps->ppu_timer, jiffies + msecs_to_jiffies(10));
+	mutex_unlock(&ps->ppu_mutex);
+}
+
+void mv88e6xxx_ppu_state_init(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+
+	mutex_init(&ps->ppu_mutex);
+	INIT_WORK(&ps->ppu_work, mv88e6xxx_ppu_reenable_work);
+	init_timer(&ps->ppu_timer);
+	ps->ppu_timer.data = (unsigned long)ps;
+	ps->ppu_timer.function = mv88e6xxx_ppu_reenable_timer;
+}
+
+int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum)
+{
+	int ret;
+
+	ret = mv88e6xxx_ppu_access_get(ds);
+	if (ret >= 0) {
+	        ret = mv88e6xxx_reg_read(ds, addr, regnum);
+	        mv88e6xxx_ppu_access_put(ds);
+	}
+
+	return ret;
+}
+
+int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr,
+			    int regnum, u16 val)
+{
+	int ret;
+
+	ret = mv88e6xxx_ppu_access_get(ds);
+	if (ret >= 0) {
+	        ret = mv88e6xxx_reg_write(ds, addr, regnum, val);
+	        mv88e6xxx_ppu_access_put(ds);
+	}
+
+	return ret;
+}
+#endif
+
 void mv88e6xxx_poll_link(struct dsa_switch *ds)
 {
 	int i;
diff --git a/net/dsa/mv88e6xxx.h b/net/dsa/mv88e6xxx.h
index a004d4d0208..eb0e0aaa9f1 100644
--- a/net/dsa/mv88e6xxx.h
+++ b/net/dsa/mv88e6xxx.h
@@ -23,6 +23,17 @@ struct mv88e6xxx_priv_state {
 	 */
 	struct mutex	smi_mutex;
 
+#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU
+	/*
+	 * Handles automatic disabling and re-enabling of the PHY
+	 * polling unit.
+	 */
+	struct mutex		ppu_mutex;
+	int			ppu_disabled;
+	struct work_struct	ppu_work;
+	struct timer_list	ppu_timer;
+#endif
+
 	/*
 	 * This mutex serialises access to the statistics unit.
 	 * Hold this mutex over snapshot + dump sequences.
@@ -42,9 +53,14 @@ int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
                           int reg, u16 val);
 int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val);
 int mv88e6xxx_config_prio(struct dsa_switch *ds);
+int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr);
 int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr);
 int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum);
 int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val);
+void mv88e6xxx_ppu_state_init(struct dsa_switch *ds);
+int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum);
+int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr,
+			    int regnum, u16 val);
 void mv88e6xxx_poll_link(struct dsa_switch *ds);
 void mv88e6xxx_get_strings(struct dsa_switch *ds,
 			   int nr_stats, struct mv88e6xxx_hw_stat *stats,
-- 
cgit v1.2.3


From 396138f03f4521c55ecc3a5dd75d4c56e6323244 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Tue, 7 Oct 2008 13:46:07 +0000
Subject: dsa: add support for Trailer tagging format

This adds support for the Trailer switch tagging format.  This is
another tagging that doesn't explicitly mark tagged packets with a
distinct ethertype, so that we need to add a similar hack in the
receive path as for the Original DSA tagging format.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Byron Bradley <byron.bbradley@gmail.com>
Tested-by: Tim Ellis <tim.ellis@mac.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig       |   4 ++
 net/dsa/Makefile      |   1 +
 net/dsa/dsa.c         |   7 +++
 net/dsa/dsa_priv.h    |   3 ++
 net/dsa/slave.c       |   5 ++
 net/dsa/tag_trailer.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ethernet/eth.c    |   2 +
 7 files changed, 152 insertions(+)
 create mode 100644 net/dsa/tag_trailer.c

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 79bcd76d3f1..505aa14e67f 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -18,6 +18,10 @@ config NET_DSA_TAG_EDSA
 	bool
 	default n
 
+config NET_DSA_TAG_TRAILER
+	bool
+	default n
+
 
 # switch drivers
 config NET_DSA_MV88E6XXX
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 7fb6f85a69e..63d3c44908b 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,7 @@
 # tagging formats
 obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
 obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
 
 # switch drivers
 obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index f8c549281c3..33e99462023 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -217,6 +217,13 @@ bool dsa_uses_dsa_tags(void *dsa_ptr)
 	return !!(ds->tag_protocol == htons(ETH_P_DSA));
 }
 
+bool dsa_uses_trailer_tags(void *dsa_ptr)
+{
+	struct dsa_switch *ds = dsa_ptr;
+
+	return !!(ds->tag_protocol == htons(ETH_P_TRAILER));
+}
+
 
 /* link polling *************************************************************/
 static void dsa_link_poll_work(struct work_struct *ugly)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 2f1d68c495e..7063378a1eb 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -109,5 +109,8 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev);
 /* tag_edsa.c */
 int edsa_xmit(struct sk_buff *skb, struct net_device *dev);
 
+/* tag_trailer.c */
+int trailer_xmit(struct sk_buff *skb, struct net_device *dev);
+
 
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8f8868dd430..37616884b8a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -248,6 +248,11 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	case htons(ETH_P_EDSA):
 		slave_dev->hard_start_xmit = edsa_xmit;
 		break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+	case htons(ETH_P_TRAILER):
+		slave_dev->hard_start_xmit = trailer_xmit;
+		break;
 #endif
 	default:
 		BUG();
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
new file mode 100644
index 00000000000..d3117764b2c
--- /dev/null
+++ b/net/dsa/tag_trailer.c
@@ -0,0 +1,130 @@
+/*
+ * net/dsa/tag_trailer.c - Trailer tag format handling
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include "dsa_priv.h"
+
+int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct sk_buff *nskb;
+	int padlen;
+	u8 *trailer;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/*
+	 * We have to make sure that the trailer ends up as the very
+	 * last 4 bytes of the packet.  This means that we have to pad
+	 * the packet to the minimum ethernet frame size, if necessary,
+	 * before adding the trailer.
+	 */
+	padlen = 0;
+	if (skb->len < 60)
+		padlen = 60 - skb->len;
+
+	nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
+	if (nskb == NULL) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+	skb_reserve(nskb, NET_IP_ALIGN);
+
+	skb_reset_mac_header(nskb);
+	skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
+	skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
+	skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
+	kfree_skb(skb);
+
+	if (padlen) {
+		u8 *pad = skb_put(nskb, padlen);
+		memset(pad, 0, padlen);
+	}
+
+	trailer = skb_put(nskb, 4);
+	trailer[0] = 0x80;
+	trailer[1] = 1 << p->port;
+	trailer[2] = 0x10;
+	trailer[3] = 0x00;
+
+	nskb->protocol = htons(ETH_P_TRAILER);
+
+	nskb->dev = p->parent->master_netdev;
+	dev_queue_xmit(nskb);
+
+	return NETDEV_TX_OK;
+}
+
+static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch *ds = dev->dsa_ptr;
+	u8 *trailer;
+	int source_port;
+
+	if (unlikely(ds == NULL))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	if (skb_linearize(skb))
+		goto out_drop;
+
+	trailer = skb_tail_pointer(skb) - 4;
+	if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
+	    (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00)
+		goto out_drop;
+
+	source_port = trailer[1] & 7;
+	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+		goto out_drop;
+
+	pskb_trim_rcsum(skb, skb->len - 4);
+
+	skb->dev = ds->ports[source_port];
+	skb_push(skb, ETH_HLEN);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->last_rx = jiffies;
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+static struct packet_type trailer_packet_type = {
+	.type	= __constant_htons(ETH_P_TRAILER),
+	.func	= trailer_rcv,
+};
+
+static int __init trailer_init_module(void)
+{
+	dev_add_pack(&trailer_packet_type);
+	return 0;
+}
+module_init(trailer_init_module);
+
+static void __exit trailer_cleanup_module(void)
+{
+	dev_remove_pack(&trailer_packet_type);
+}
+module_exit(trailer_cleanup_module);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index dae47e7a44d..b9d85af2dd3 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -193,6 +193,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	 */
 	if (netdev_uses_dsa_tags(dev))
 		return htons(ETH_P_DSA);
+	if (netdev_uses_trailer_tags(dev))
+		return htons(ETH_P_TRAILER);
 
 	if (ntohs(eth->h_proto) >= 1536)
 		return eth->h_proto;
-- 
cgit v1.2.3


From 2e16a77e1e674644b4fe552daa1fb11e32398ae6 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Tue, 7 Oct 2008 13:46:22 +0000
Subject: dsa: add support for the Marvell 88E6060 switch chip

Add support for the Marvell 88E6060 switch chip.  This chip only
supports the Header and Trailer tagging formats, and we use it in
Trailer mode since that mode is slightly easier to handle than
Header mode.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Byron Bradley <byron.bbradley@gmail.com>
Tested-by: Tim Ellis <tim.ellis@mac.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig     |   7 ++
 net/dsa/Makefile    |   1 +
 net/dsa/mv88e6060.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 295 insertions(+)
 create mode 100644 net/dsa/mv88e6060.c

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 505aa14e67f..3f2fd397926 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -28,6 +28,13 @@ config NET_DSA_MV88E6XXX
 	bool
 	default n
 
+config NET_DSA_MV88E6060
+	bool "Marvell 88E6060 ethernet switch chip support"
+	select NET_DSA_TAG_TRAILER
+	---help---
+	  This enables support for the Marvell 88E6060 ethernet switch
+	  chip.
+
 config NET_DSA_MV88E6XXX_NEED_PPU
 	bool
 	default n
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 63d3c44908b..2374faff4de 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
 
 # switch drivers
 obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
+obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o
 obj-$(CONFIG_NET_DSA_MV88E6123_61_65) += mv88e6123_61_65.o
 obj-$(CONFIG_NET_DSA_MV88E6131) += mv88e6131.o
 
diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c
new file mode 100644
index 00000000000..54068ef251e
--- /dev/null
+++ b/net/dsa/mv88e6060.c
@@ -0,0 +1,287 @@
+/*
+ * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+
+#define REG_PORT(p)		(8 + (p))
+#define REG_GLOBAL		0x0f
+
+static int reg_read(struct dsa_switch *ds, int addr, int reg)
+{
+	return mdiobus_read(ds->master_mii_bus, addr, reg);
+}
+
+#define REG_READ(addr, reg)					\
+	({							\
+		int __ret;					\
+								\
+		__ret = reg_read(ds, addr, reg);		\
+		if (__ret < 0)					\
+			return __ret;				\
+		__ret;						\
+	})
+
+
+static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
+{
+	return mdiobus_write(ds->master_mii_bus, addr, reg, val);
+}
+
+#define REG_WRITE(addr, reg, val)				\
+	({							\
+		int __ret;					\
+								\
+		__ret = reg_write(ds, addr, reg, val);		\
+		if (__ret < 0)					\
+			return __ret;				\
+	})
+
+static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr)
+{
+	int ret;
+
+	ret = mdiobus_read(bus, REG_PORT(0), 0x03);
+	if (ret >= 0) {
+		ret &= 0xfff0;
+		if (ret == 0x0600)
+			return "Marvell 88E6060";
+	}
+
+	return NULL;
+}
+
+static int mv88e6060_switch_reset(struct dsa_switch *ds)
+{
+	int i;
+	int ret;
+
+	/*
+	 * Set all ports to the disabled state.
+	 */
+	for (i = 0; i < 6; i++) {
+		ret = REG_READ(REG_PORT(i), 0x04);
+		REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
+	}
+
+	/*
+	 * Wait for transmit queues to drain.
+	 */
+	msleep(2);
+
+	/*
+	 * Reset the switch.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x0A, 0xa130);
+
+	/*
+	 * Wait up to one second for reset to complete.
+	 */
+	for (i = 0; i < 1000; i++) {
+		ret = REG_READ(REG_GLOBAL, 0x00);
+		if ((ret & 0x8000) == 0x0000)
+			break;
+
+		msleep(1);
+	}
+	if (i == 1000)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static int mv88e6060_setup_global(struct dsa_switch *ds)
+{
+	/*
+	 * Disable discarding of frames with excessive collisions,
+	 * set the maximum frame size to 1536 bytes, and mask all
+	 * interrupt sources.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0x0800);
+
+	/*
+	 * Enable automatic address learning, set the address
+	 * database size to 1024 entries, and set the default aging
+	 * time to 5 minutes.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x0a, 0x2130);
+
+	return 0;
+}
+
+static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
+{
+	int addr = REG_PORT(p);
+
+	/*
+	 * Do not force flow control, disable Ingress and Egress
+	 * Header tagging, disable VLAN tunneling, and set the port
+	 * state to Forwarding.  Additionally, if this is the CPU
+	 * port, enable Ingress and Egress Trailer tagging mode.
+	 */
+	REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x4103 : 0x0003);
+
+	/*
+	 * Port based VLAN map: give each port its own address
+	 * database, allow the CPU port to talk to each of the 'real'
+	 * ports, and allow each of the 'real' ports to only talk to
+	 * the CPU port.
+	 */
+	REG_WRITE(addr, 0x06,
+			((p & 0xf) << 12) |
+			 ((p == ds->cpu_port) ?
+				ds->valid_port_mask :
+				(1 << ds->cpu_port)));
+
+	/*
+	 * Port Association Vector: when learning source addresses
+	 * of packets, add the address to the address database using
+	 * a port bitmap that has only the bit for this port set and
+	 * the other bits clear.
+	 */
+	REG_WRITE(addr, 0x0b, 1 << p);
+
+	return 0;
+}
+
+static int mv88e6060_setup(struct dsa_switch *ds)
+{
+	int i;
+	int ret;
+
+	ret = mv88e6060_switch_reset(ds);
+	if (ret < 0)
+		return ret;
+
+	/* @@@ initialise atu */
+
+	ret = mv88e6060_setup_global(ds);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < 6; i++) {
+		ret = mv88e6060_setup_port(ds, i);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mv88e6060_set_addr(struct dsa_switch *ds, u8 *addr)
+{
+	REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]);
+	REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]);
+	REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]);
+
+	return 0;
+}
+
+static int mv88e6060_port_to_phy_addr(int port)
+{
+	if (port >= 0 && port <= 5)
+		return port;
+	return -1;
+}
+
+static int mv88e6060_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+	int addr;
+
+	addr = mv88e6060_port_to_phy_addr(port);
+	if (addr == -1)
+		return 0xffff;
+
+	return reg_read(ds, addr, regnum);
+}
+
+static int
+mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
+{
+	int addr;
+
+	addr = mv88e6060_port_to_phy_addr(port);
+	if (addr == -1)
+		return 0xffff;
+
+	return reg_write(ds, addr, regnum, val);
+}
+
+static void mv88e6060_poll_link(struct dsa_switch *ds)
+{
+	int i;
+
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		struct net_device *dev;
+		int port_status;
+		int link;
+		int speed;
+		int duplex;
+		int fc;
+
+		dev = ds->ports[i];
+		if (dev == NULL)
+			continue;
+
+		link = 0;
+		if (dev->flags & IFF_UP) {
+			port_status = reg_read(ds, REG_PORT(i), 0x00);
+			if (port_status < 0)
+				continue;
+
+			link = !!(port_status & 0x1000);
+		}
+
+		if (!link) {
+			if (netif_carrier_ok(dev)) {
+				printk(KERN_INFO "%s: link down\n", dev->name);
+				netif_carrier_off(dev);
+			}
+			continue;
+		}
+
+		speed = (port_status & 0x0100) ? 100 : 10;
+		duplex = (port_status & 0x0200) ? 1 : 0;
+		fc = ((port_status & 0xc000) == 0xc000) ? 1 : 0;
+
+		if (!netif_carrier_ok(dev)) {
+			printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, "
+					 "flow control %sabled\n", dev->name,
+					 speed, duplex ? "full" : "half",
+					 fc ? "en" : "dis");
+			netif_carrier_on(dev);
+		}
+	}
+}
+
+static struct dsa_switch_driver mv88e6060_switch_driver = {
+	.tag_protocol	= htons(ETH_P_TRAILER),
+	.probe		= mv88e6060_probe,
+	.setup		= mv88e6060_setup,
+	.set_addr	= mv88e6060_set_addr,
+	.phy_read	= mv88e6060_phy_read,
+	.phy_write	= mv88e6060_phy_write,
+	.poll_link	= mv88e6060_poll_link,
+};
+
+int __init mv88e6060_init(void)
+{
+	register_switch_driver(&mv88e6060_switch_driver);
+	return 0;
+}
+module_init(mv88e6060_init);
+
+void __exit mv88e6060_cleanup(void)
+{
+	unregister_switch_driver(&mv88e6060_switch_driver);
+}
+module_exit(mv88e6060_cleanup);
-- 
cgit v1.2.3


From 45cec1bac0719c904bb5f4405c2937f7e715888c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 8 Oct 2008 17:33:01 -0700
Subject: dsa: Need to select PHYLIB.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 3f2fd397926..cdce4c6c672 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -2,6 +2,7 @@ menuconfig NET_DSA
 	bool "Distributed Switch Architecture support"
 	default n
 	depends on EXPERIMENTAL
+	select PHYLIB
 	---help---
 	  This allows you to use hardware switch chips that use
 	  the Distributed Switch Architecture.
-- 
cgit v1.2.3


From c95b819ad75b13102139aad0e7062d927be23cc6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 9 Oct 2008 11:58:54 -0700
Subject: gre: Use needed_headroom

Now that we have dev->needed_headroom, we can use it instead of
having a bogus dev->hard_header_len.  This also allows us to
include dev->hard_header_len in the MTU computation so that when
we do have a meaningful hard_harder_len in future it is included
automatically in figuring out the MTU.

Incidentally, this fixes a bug where we ignored the needed_headroom
field of the underlying device in calculating our own hard_header_len.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2a61158ea72..fd192d67695 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -637,7 +637,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	df = tiph->frag_off;
 	if (df)
-		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
+		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
 	else
 		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
 
@@ -785,7 +785,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 	tunnel = netdev_priv(dev);
 	iph = &tunnel->parms.iph;
 
-	/* Guess output device to choose reasonable mtu and hard_header_len */
+	/* Guess output device to choose reasonable mtu and needed_headroom */
 
 	if (iph->daddr) {
 		struct flowi fl = { .oif = tunnel->parms.link,
@@ -806,7 +806,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 
 	if (tdev) {
-		hlen = tdev->hard_header_len;
+		hlen = tdev->hard_header_len + tdev->needed_headroom;
 		mtu = tdev->mtu;
 	}
 	dev->iflink = tunnel->parms.link;
@@ -820,8 +820,8 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 		if (tunnel->parms.o_flags&GRE_SEQ)
 			addend += 4;
 	}
-	dev->hard_header_len = hlen + addend;
-	dev->mtu = mtu - addend;
+	dev->needed_headroom = addend + hlen;
+	dev->mtu = mtu - dev->hard_header_len - addend;
 	tunnel->hlen = addend;
 
 }
@@ -959,7 +959,8 @@ done:
 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
+	if (new_mtu < 68 ||
+	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
 		return -EINVAL;
 	dev->mtu = new_mtu;
 	return 0;
@@ -1085,7 +1086,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->change_mtu		= ipgre_tunnel_change_mtu;
 
 	dev->type		= ARPHRD_IPGRE;
-	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
-- 
cgit v1.2.3


From 42aa916265d740d66ac1f17290366e9494c884c2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 9 Oct 2008 11:59:32 -0700
Subject: gre: Move MTU setting out of ipgre_tunnel_bind_dev

This patch moves the dev->mtu setting out of ipgre_tunnel_bind_dev.
This is in prepartion of using rtnl_link where we'll need to make
the MTU setting conditional on whether the user has supplied an
MTU.  This also requires the move of the ipgre_tunnel_bind_dev
call out of the dev->init function so that we can access the user
parameters later.

This patch also adds a check to prevent setting the MTU below
the minimum of 68.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fd192d67695..80622dd6fb3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -119,6 +119,7 @@
 
 static int ipgre_tunnel_init(struct net_device *dev);
 static void ipgre_tunnel_setup(struct net_device *dev);
+static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 /* Fallback tunnel: no source, no destination, no key, no options */
 
@@ -289,6 +290,8 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
 	nt = netdev_priv(dev);
 	nt->parms = *parms;
 
+	dev->mtu = ipgre_tunnel_bind_dev(dev);
+
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
 
@@ -773,7 +776,7 @@ tx_error:
 	return 0;
 }
 
-static void ipgre_tunnel_bind_dev(struct net_device *dev)
+static int ipgre_tunnel_bind_dev(struct net_device *dev)
 {
 	struct net_device *tdev = NULL;
 	struct ip_tunnel *tunnel;
@@ -821,9 +824,14 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 			addend += 4;
 	}
 	dev->needed_headroom = addend + hlen;
-	dev->mtu = mtu - dev->hard_header_len - addend;
+	mtu -= dev->hard_header_len - addend;
+
+	if (mtu < 68)
+		mtu = 68;
+
 	tunnel->hlen = addend;
 
+	return mtu;
 }
 
 static int
@@ -917,7 +925,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 				t->parms.iph.frag_off = p.iph.frag_off;
 				if (t->parms.link != p.link) {
 					t->parms.link = p.link;
-					ipgre_tunnel_bind_dev(dev);
+					dev->mtu = ipgre_tunnel_bind_dev(dev);
 					netdev_state_change(dev);
 				}
 			}
@@ -1108,8 +1116,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
-	ipgre_tunnel_bind_dev(dev);
-
 	if (iph->daddr) {
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
-- 
cgit v1.2.3


From c19e654ddbe3831252f61e76a74d661e1a755530 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 9 Oct 2008 11:59:55 -0700
Subject: gre: Add netlink interface

This patch adds a netlink interface that will eventually displace
the existing ioctl interface.  It utilises the elegant rtnl_link_ops
mechanism.

This also means that user-space no longer needs to rely on the
tunnel interface being of type GRE to identify GRE tunnels.  The
identification can now occur using rtnl_link_ops.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 80622dd6fb3..25d2c77a7f3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -41,6 +41,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/rtnetlink.h>
 
 #ifdef CONFIG_IPV6
 #include <net/ipv6.h>
@@ -117,6 +118,7 @@
    Alexey Kuznetsov.
  */
 
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void ipgre_tunnel_setup(struct net_device *dev);
 static int ipgre_tunnel_bind_dev(struct net_device *dev);
@@ -286,9 +288,9 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
 			goto failed_free;
 	}
 
-	dev->init = ipgre_tunnel_init;
 	nt = netdev_priv(dev);
 	nt->parms = *parms;
+	dev->rtnl_link_ops = &ipgre_link_ops;
 
 	dev->mtu = ipgre_tunnel_bind_dev(dev);
 
@@ -1087,6 +1089,7 @@ static int ipgre_close(struct net_device *dev)
 
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
+	dev->init		= ipgre_tunnel_init;
 	dev->uninit		= ipgre_tunnel_uninit;
 	dev->destructor 	= free_netdev;
 	dev->hard_start_xmit	= ipgre_tunnel_xmit;
@@ -1196,6 +1199,7 @@ static int ipgre_init_net(struct net *net)
 
 	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
 	dev_net_set(ign->fb_tunnel_dev, net);
+	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
 
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
 		goto err_reg_dev;
@@ -1228,6 +1232,229 @@ static struct pernet_operations ipgre_net_ops = {
 	.exit = ipgre_exit_net,
 };
 
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	__be16 flags;
+
+	if (!data)
+		return 0;
+
+	flags = 0;
+	if (data[IFLA_GRE_IFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (data[IFLA_GRE_OFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	if (flags & (GRE_VERSION|GRE_ROUTING))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void ipgre_netlink_parms(struct nlattr *data[],
+				struct ip_tunnel_parm *parms)
+{
+	memset(parms, 0, sizeof(parms));
+
+	parms->iph.protocol = IPPROTO_GRE;
+
+	if (!data)
+		return;
+
+	if (data[IFLA_GRE_LINK])
+		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+	if (data[IFLA_GRE_IFLAGS])
+		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+
+	if (data[IFLA_GRE_OFLAGS])
+		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+
+	if (data[IFLA_GRE_IKEY])
+		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+	if (data[IFLA_GRE_OKEY])
+		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+	if (data[IFLA_GRE_LOCAL])
+		memcpy(&parms->iph.saddr, nla_data(data[IFLA_GRE_LOCAL]), 4);
+
+	if (data[IFLA_GRE_REMOTE])
+		memcpy(&parms->iph.daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+
+	if (data[IFLA_GRE_TTL])
+		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+	if (data[IFLA_GRE_TOS])
+		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+
+	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+		parms->iph.frag_off = htons(IP_DF);
+}
+
+static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
+			 struct nlattr *data[])
+{
+	struct ip_tunnel *nt;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	int mtu;
+	int err;
+
+	nt = netdev_priv(dev);
+	ipgre_netlink_parms(data, &nt->parms);
+
+	if (ipgre_tunnel_locate(net, &nt->parms, 0))
+		return -EEXIST;
+
+	mtu = ipgre_tunnel_bind_dev(dev);
+	if (!tb[IFLA_MTU])
+		dev->mtu = mtu;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto out;
+
+	dev_hold(dev);
+	ipgre_tunnel_link(ign, nt);
+
+out:
+	return err;
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+			    struct nlattr *data[])
+{
+	struct ip_tunnel *t, *nt;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel_parm p;
+	int mtu;
+
+	if (dev == ign->fb_tunnel_dev)
+		return -EINVAL;
+
+	nt = netdev_priv(dev);
+	ipgre_netlink_parms(data, &p);
+
+	t = ipgre_tunnel_locate(net, &p, 0);
+
+	if (t) {
+		if (t->dev != dev)
+			return -EEXIST;
+	} else {
+		unsigned nflags = 0;
+
+		t = nt;
+
+		if (ipv4_is_multicast(p.iph.daddr))
+			nflags = IFF_BROADCAST;
+		else if (p.iph.daddr)
+			nflags = IFF_POINTOPOINT;
+
+		if ((dev->flags ^ nflags) &
+		    (IFF_POINTOPOINT | IFF_BROADCAST))
+			return -EINVAL;
+
+		ipgre_tunnel_unlink(ign, t);
+		t->parms.iph.saddr = p.iph.saddr;
+		t->parms.iph.daddr = p.iph.daddr;
+		t->parms.i_key = p.i_key;
+		memcpy(dev->dev_addr, &p.iph.saddr, 4);
+		memcpy(dev->broadcast, &p.iph.daddr, 4);
+		ipgre_tunnel_link(ign, t);
+		netdev_state_change(dev);
+	}
+
+	t->parms.o_key = p.o_key;
+	t->parms.iph.ttl = p.iph.ttl;
+	t->parms.iph.tos = p.iph.tos;
+	t->parms.iph.frag_off = p.iph.frag_off;
+
+	if (t->parms.link != p.link) {
+		t->parms.link = p.link;
+		mtu = ipgre_tunnel_bind_dev(dev);
+		if (!tb[IFLA_MTU])
+			dev->mtu = mtu;
+		netdev_state_change(dev);
+	}
+
+	return 0;
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+	return
+		/* IFLA_GRE_LINK */
+		nla_total_size(4) +
+		/* IFLA_GRE_IFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_OFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_IKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_OKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_LOCAL */
+		nla_total_size(4) +
+		/* IFLA_GRE_REMOTE */
+		nla_total_size(4) +
+		/* IFLA_GRE_TTL */
+		nla_total_size(1) +
+		/* IFLA_GRE_TOS */
+		nla_total_size(1) +
+		/* IFLA_GRE_PMTUDISC */
+		nla_total_size(1) +
+		0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm *p = &t->parms;
+
+	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
+	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
+	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
+	NLA_PUT_BE32(skb, IFLA_GRE_IFLAGS, p->i_flags);
+	NLA_PUT_BE32(skb, IFLA_GRE_OFLAGS, p->o_flags);
+	NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr);
+	NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr);
+	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
+	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
+	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
+	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_LOCAL]	= { .len = 4 },
+	[IFLA_GRE_REMOTE]	= { .len = 4 },
+	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
+	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
+	.kind		= "gre",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tunnel_setup,
+	.validate	= ipgre_tunnel_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+};
+
 /*
  *	And now the modules code and kernel interface.
  */
@@ -1245,19 +1472,31 @@ static int __init ipgre_init(void)
 
 	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
 	if (err < 0)
-		inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+		goto gen_device_failed;
 
+	err = rtnl_link_register(&ipgre_link_ops);
+	if (err < 0)
+		goto rtnl_link_failed;
+
+out:
 	return err;
+
+rtnl_link_failed:
+	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+gen_device_failed:
+	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+	goto out;
 }
 
 static void __exit ipgre_fini(void)
 {
+	rtnl_link_unregister(&ipgre_link_ops);
+	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
-
-	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
 }
 
 module_init(ipgre_init);
 module_exit(ipgre_fini);
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("rtnl-link-gre");
-- 
cgit v1.2.3


From e1a8000228e16212c93b23cfbed4d622e2ec7a6b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 9 Oct 2008 12:00:17 -0700
Subject: gre: Add Transparent Ethernet Bridging

This patch adds support for Ethernet over GRE encapsulation.
This is exposed to user-space with a new link type of "gretap"
instead of "gre".  It will create an ARPHRD_ETHER device in
lieu of the usual ARPHRD_IPGRE.

Note that to preserver backwards compatibility all Transparent
Ethernet Bridging packets are passed to an ARPHRD_IPGRE tunnel
if its key matches and there is no ARPHRD_ETHER device whose
key matches more closely.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 206 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 174 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 25d2c77a7f3..44ed9487fa1 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -27,6 +27,7 @@
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
 #include <linux/if_ether.h>
 
 #include <net/sock.h>
@@ -166,38 +167,64 @@ static DEFINE_RWLOCK(ipgre_lock);
 /* Given src, dst and key, find appropriate for input tunnel. */
 
 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
-		__be32 remote, __be32 local, __be32 key)
+					      __be32 remote, __be32 local,
+					      __be32 key, __be16 gre_proto)
 {
 	unsigned h0 = HASH(remote);
 	unsigned h1 = HASH(key);
 	struct ip_tunnel *t;
+	struct ip_tunnel *t2 = NULL;
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+		       ARPHRD_ETHER : ARPHRD_IPGRE;
 
 	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
-			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-				return t;
+			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+				if (t->dev->type == dev_type)
+					return t;
+				if (t->dev->type == ARPHRD_IPGRE && !t2)
+					t2 = t;
+			}
 		}
 	}
+
 	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
 		if (remote == t->parms.iph.daddr) {
-			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-				return t;
+			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+				if (t->dev->type == dev_type)
+					return t;
+				if (t->dev->type == ARPHRD_IPGRE && !t2)
+					t2 = t;
+			}
 		}
 	}
+
 	for (t = ign->tunnels_l[h1]; t; t = t->next) {
 		if (local == t->parms.iph.saddr ||
 		     (local == t->parms.iph.daddr &&
 		      ipv4_is_multicast(local))) {
-			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-				return t;
+			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+				if (t->dev->type == dev_type)
+					return t;
+				if (t->dev->type == ARPHRD_IPGRE && !t2)
+					t2 = t;
+			}
 		}
 	}
+
 	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
-		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-			return t;
+		if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+			if (t->dev->type == dev_type)
+				return t;
+			if (t->dev->type == ARPHRD_IPGRE && !t2)
+				t2 = t;
+		}
 	}
 
+	if (t2)
+		return t2;
+
 	if (ign->fb_tunnel_dev->flags&IFF_UP)
 		return netdev_priv(ign->fb_tunnel_dev);
 	return NULL;
@@ -252,25 +279,37 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 	}
 }
 
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
-		struct ip_tunnel_parm *parms, int create)
+static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
+					   struct ip_tunnel_parm *parms,
+					   int type)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
 	__be32 key = parms->i_key;
-	struct ip_tunnel *t, **tp, *nt;
+	struct ip_tunnel *t, **tp;
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+
+	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+		if (local == t->parms.iph.saddr &&
+		    remote == t->parms.iph.daddr &&
+		    key == t->parms.i_key &&
+		    type == t->dev->type)
+			break;
+
+	return t;
+}
+
+static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+		struct ip_tunnel_parm *parms, int create)
+{
+	struct ip_tunnel *t, *nt;
 	struct net_device *dev;
 	char name[IFNAMSIZ];
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 
-	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
-		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
-			if (key == t->parms.i_key)
-				return t;
-		}
-	}
-	if (!create)
-		return NULL;
+	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
+	if (t || !create)
+		return t;
 
 	if (parms->name[0])
 		strlcpy(name, parms->name, IFNAMSIZ);
@@ -385,8 +424,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 
 	read_lock(&ipgre_lock);
 	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
-			(flags&GRE_KEY) ?
-			*(((__be32*)p) + (grehlen>>2) - 1) : 0);
+				flags & GRE_KEY ?
+				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
+				p[1]);
 	if (t == NULL || t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
 		goto out;
@@ -436,6 +476,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 	u32    seqno = 0;
 	struct ip_tunnel *tunnel;
 	int    offset = 4;
+	__be16 gre_proto;
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop_nolock;
@@ -475,20 +516,22 @@ static int ipgre_rcv(struct sk_buff *skb)
 		}
 	}
 
+	gre_proto = *(__be16 *)(h + 2);
+
 	read_lock(&ipgre_lock);
 	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
-					iph->saddr, iph->daddr, key)) != NULL) {
+					  iph->saddr, iph->daddr, key,
+					  gre_proto))) {
 		struct net_device_stats *stats = &tunnel->dev->stats;
 
 		secpath_reset(skb);
 
-		skb->protocol = *(__be16*)(h + 2);
+		skb->protocol = gre_proto;
 		/* WCCP version 1 and 2 protocol decoding.
 		 * - Change protocol to IP
 		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 		 */
-		if (flags == 0 &&
-		    skb->protocol == htons(ETH_P_WCCP)) {
+		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
 			skb->protocol = htons(ETH_P_IP);
 			if ((*(h + offset) & 0xF0) != 0x40)
 				offset += 4;
@@ -496,7 +539,6 @@ static int ipgre_rcv(struct sk_buff *skb)
 
 		skb->mac_header = skb->network_header;
 		__pskb_pull(skb, offset);
-		skb_reset_network_header(skb);
 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
 		skb->pkt_type = PACKET_HOST;
 #ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -524,13 +566,30 @@ static int ipgre_rcv(struct sk_buff *skb)
 			}
 			tunnel->i_seqno = seqno + 1;
 		}
+
+		/* Warning: All skb pointers will be invalidated! */
+		if (tunnel->dev->type == ARPHRD_ETHER) {
+			if (!pskb_may_pull(skb, ETH_HLEN)) {
+				stats->rx_length_errors++;
+				stats->rx_errors++;
+				goto drop;
+			}
+
+			iph = ip_hdr(skb);
+			skb->protocol = eth_type_trans(skb, tunnel->dev);
+			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+		}
+
 		stats->rx_packets++;
 		stats->rx_bytes += skb->len;
 		skb->dev = tunnel->dev;
 		dst_release(skb->dst);
 		skb->dst = NULL;
 		nf_reset(skb);
+
+		skb_reset_network_header(skb);
 		ipgre_ecn_decapsulate(iph, skb);
+
 		netif_rx(skb);
 		read_unlock(&ipgre_lock);
 		return(0);
@@ -565,7 +624,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_error;
 	}
 
-	if (dev->header_ops) {
+	if (dev->type == ARPHRD_ETHER)
+		IPCB(skb)->flags = 0;
+
+	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 		gre_hlen = 0;
 		tiph = (struct iphdr*)skb->data;
 	} else {
@@ -741,8 +803,9 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
 	}
 
-	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
-	((__be16*)(iph+1))[1] = skb->protocol;
+	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
+	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
+				   htons(ETH_P_TEB) : skb->protocol;
 
 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
@@ -804,7 +867,9 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
 			tdev = rt->u.dst.dev;
 			ip_rt_put(rt);
 		}
-		dev->flags |= IFF_POINTOPOINT;
+
+		if (dev->type != ARPHRD_ETHER)
+			dev->flags |= IFF_POINTOPOINT;
 	}
 
 	if (!tdev && tunnel->parms.link)
@@ -1250,6 +1315,30 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	__be32 daddr;
+
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+
+	if (!data)
+		goto out;
+
+	if (data[IFLA_GRE_REMOTE]) {
+		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+		if (!daddr)
+			return -EINVAL;
+	}
+
+out:
+	return ipgre_tunnel_validate(tb, data);
+}
+
 static void ipgre_netlink_parms(struct nlattr *data[],
 				struct ip_tunnel_parm *parms)
 {
@@ -1291,6 +1380,35 @@ static void ipgre_netlink_parms(struct nlattr *data[],
 		parms->iph.frag_off = htons(IP_DF);
 }
 
+static int ipgre_tap_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel;
+
+	tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	ipgre_tunnel_bind_dev(dev);
+
+	return 0;
+}
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+
+	ether_setup(dev);
+
+	dev->init		= ipgre_tap_init;
+	dev->uninit		= ipgre_tunnel_uninit;
+	dev->destructor 	= free_netdev;
+	dev->hard_start_xmit	= ipgre_tunnel_xmit;
+	dev->change_mtu		= ipgre_tunnel_change_mtu;
+
+	dev->iflink		= 0;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+}
+
 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
 			 struct nlattr *data[])
 {
@@ -1303,9 +1421,12 @@ static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
 	nt = netdev_priv(dev);
 	ipgre_netlink_parms(data, &nt->parms);
 
-	if (ipgre_tunnel_locate(net, &nt->parms, 0))
+	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
 		return -EEXIST;
 
+	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+		random_ether_addr(dev->dev_addr);
+
 	mtu = ipgre_tunnel_bind_dev(dev);
 	if (!tb[IFLA_MTU])
 		dev->mtu = mtu;
@@ -1455,6 +1576,19 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
 	.fill_info	= ipgre_fill_info,
 };
 
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+	.kind		= "gretap",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tap_setup,
+	.validate	= ipgre_tap_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+};
+
 /*
  *	And now the modules code and kernel interface.
  */
@@ -1478,9 +1612,15 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = rtnl_link_register(&ipgre_tap_ops);
+	if (err < 0)
+		goto tap_ops_failed;
+
 out:
 	return err;
 
+tap_ops_failed:
+	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
 	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
 gen_device_failed:
@@ -1490,6 +1630,7 @@ gen_device_failed:
 
 static void __exit ipgre_fini(void)
 {
+	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
 	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
@@ -1500,3 +1641,4 @@ module_init(ipgre_init);
 module_exit(ipgre_fini);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("rtnl-link-gre");
+MODULE_ALIAS("rtnl-link-gretap");
-- 
cgit v1.2.3


From 64194c31a0b6f5d84703b772113aafc400eeaad6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 9 Oct 2008 12:03:17 -0700
Subject: inet: Make tunnel RX/TX byte counters more consistent

This patch makes the RX/TX byte counters for IPIP, GRE and SIT more
consistent.  Previously we included the external IP headers on the
way out but not when the packet is inbound.

The new scheme is to count payload only in both directions.  For
IPIP and SIT this simply means the exclusion of the external IP
header.  For GRE this means that we exclude the GRE header as
well.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 44ed9487fa1..0d5e35b0ed5 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -477,6 +477,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	int    offset = 4;
 	__be16 gre_proto;
+	unsigned int len;
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop_nolock;
@@ -567,6 +568,8 @@ static int ipgre_rcv(struct sk_buff *skb)
 			tunnel->i_seqno = seqno + 1;
 		}
 
+		len = skb->len;
+
 		/* Warning: All skb pointers will be invalidated! */
 		if (tunnel->dev->type == ARPHRD_ETHER) {
 			if (!pskb_may_pull(skb, ETH_HLEN)) {
@@ -581,7 +584,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 		}
 
 		stats->rx_packets++;
-		stats->rx_bytes += skb->len;
+		stats->rx_bytes += len;
 		skb->dev = tunnel->dev;
 		dst_release(skb->dst);
 		skb->dst = NULL;
@@ -770,7 +773,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = skb->network_header;
+	skb_reset_transport_header(skb);
 	skb_push(skb, gre_hlen);
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-- 
cgit v1.2.3


From 3d5a019d5761a40465337711ae7d2beb1e9b43ec Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Thu, 9 Oct 2008 14:32:24 -0700
Subject: sctp: Fix the SNMP number of SCTP_MIB_CURRESTAB

RFC3873 defined SCTP_MIB_CURRESTAB:
  sctpCurrEstab OBJECT-TYPE
    SYNTAX         Gauge32
    MAX-ACCESS     read-only
    STATUS         current
    DESCRIPTION
         "The number of associations for which the current state is
         either ESTABLISHED, SHUTDOWN-RECEIVED or SHUTDOWN-PENDING."
    REFERENCE
         "Section 4 in RFC2960 covers the SCTP   Association state
         diagram."

If the T4 RTO timer expires many times(timeout), the association will enter
CLOSED state, so we should dec the number of SCTP_MIB_CURRESTAB, not inc the
number of SCTP_MIB_CURRESTAB.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index ea3a34cbe47..1588d063c68 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5436,7 +5436,7 @@ sctp_disposition_t sctp_sf_t4_timer_expire(
 		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
 				SCTP_PERR(SCTP_ERROR_NO_ERROR));
 		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
-		SCTP_INC_STATS(SCTP_MIB_CURRESTAB);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
 		return SCTP_DISPOSITION_ABORT;
 	}
 
-- 
cgit v1.2.3


From 56eb82bb8d2cdd8d9f4838eaa109df41d7164ca5 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Thu, 9 Oct 2008 14:33:01 -0700
Subject: sctp: Fix SNMP number of SCTP_MIB_ABORTED during violation handling.

If ABORT chunks require authentication and a protocol violation
is triggered, we do not tear down the association.  Subsequently,
we should not increment SCTP_MIB_ABORTED.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 1588d063c68..f5094f1b5af 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4188,11 +4188,10 @@ static sctp_disposition_t sctp_sf_abort_violation(
 		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
 	}
 
-discard:
-	sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
-
 	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
 
+discard:
+	sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
 	return SCTP_DISPOSITION_ABORT;
 
 nomem_pkt:
@@ -4265,12 +4264,10 @@ static sctp_disposition_t sctp_sf_violation_paramlen(
 	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
 			SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
 	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
 
 discard:
 	sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
-
-	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
-
 	return SCTP_DISPOSITION_ABORT;
 nomem:
 	return SCTP_DISPOSITION_NOMEM;
-- 
cgit v1.2.3


From a1080a8b0bc301c223c4bf0cea4c5e42f43dcf58 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Thu, 9 Oct 2008 14:33:26 -0700
Subject: sctp: update SNMP statiscts when T5 timer expired.

The T5 timer is the timer for the over-all shutdown procedure.  If
this timer expires, then shutdown procedure has not completed and we
ABORT the association.  We should update SCTP_MIB_ABORTED and
SCTP_MIB_CURRESTAB  when aborting.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f5094f1b5af..d4c3fbc4671 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5489,6 +5489,9 @@ sctp_disposition_t sctp_sf_t5_timer_expire(const struct sctp_endpoint *ep,
 	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
 			SCTP_PERR(SCTP_ERROR_NO_ERROR));
 
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+
 	return SCTP_DISPOSITION_DELETE_TCB;
 nomem:
 	return SCTP_DISPOSITION_NOMEM;
-- 
cgit v1.2.3


From 78e645cb890b0f32ea81a974e29427d9cd2f64f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Thu, 9 Oct 2008 14:37:47 -0700
Subject: tcpv[46]: fix md5 pseudoheader address field ordering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Maybe it's just me but I guess those md5 people made a mess
out of it by having *_md5_hash_* to use daddr, saddr order
instead of the one that is natural (and equal to what csum
functions use). For the segment were sending, the original
addresses are reversed so buff's saddr == skb's daddr and
vice-versa.

Maybe I can finally proceed with unification of some code
after fixing it first... :-)

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 net/ipv6/tcp_ipv6.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ba46769c6e9..5c8fa7f1e32 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -583,8 +583,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 		rep.th.doff = arg.iov[0].iov_len / 4;
 
 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
-				     key, ip_hdr(skb)->daddr,
-				     ip_hdr(skb)->saddr, &rep.th);
+				     key, ip_hdr(skb)->saddr,
+				     ip_hdr(skb)->daddr, &rep.th);
 	}
 #endif
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index dd7bdde7bdd..eab10bc7ff8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1007,8 +1007,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 			       (TCPOPT_MD5SIG << 8) |
 			       TCPOLEN_MD5SIG);
 		tcp_v6_md5_hash_hdr((__u8 *)&opt[1], key,
-				    &ipv6_hdr(skb)->daddr,
-				    &ipv6_hdr(skb)->saddr, t1);
+				    &ipv6_hdr(skb)->saddr,
+				    &ipv6_hdr(skb)->daddr, t1);
 	}
 #endif
 
-- 
cgit v1.2.3


From 77c676da1b717eed7239144fb539dfc4c7b78e04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Thu, 9 Oct 2008 14:41:38 -0700
Subject: tcpv6: trivial formatting changes to send_(ack|reset)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index eab10bc7ff8..910603cd12d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -949,7 +949,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	struct flowi fl;
 	struct net *net = dev_net(skb->dst->dev);
 	struct sock *ctl_sk = net->ipv6.tcp_sk;
-	unsigned int tot_len = sizeof(*th);
+	unsigned int tot_len = sizeof(struct tcphdr);
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key *key;
 #endif
@@ -1033,7 +1033,6 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	 * namespace
 	 */
 	if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) {
-
 		if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
 			ip6_xmit(ctl_sk, buff, &fl, NULL, 0);
 			TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -1070,13 +1069,13 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
 
 	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
 
-	t1 = (struct tcphdr *) skb_push(buff,tot_len);
+	t1 = (struct tcphdr *) skb_push(buff, tot_len);
 
 	/* Swap the send and the receive. */
 	memset(t1, 0, sizeof(*t1));
 	t1->dest = th->source;
 	t1->source = th->dest;
-	t1->doff = tot_len/4;
+	t1->doff = tot_len / 4;
 	t1->seq = htonl(seq);
 	t1->ack_seq = htonl(ack);
 	t1->ack = 1;
-- 
cgit v1.2.3


From 81ada62d70060023923f46ab666cdc2970e1e0ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Thu, 9 Oct 2008 14:42:01 -0700
Subject: tcpv6: convert opt[] -> topt in tcp_v6_send_reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

after this I get:

$ diff-funcs tcp_v6_send_reset tcp_ipv6.c tcp_ipv6.c tcp_v6_send_ack
 --- tcp_ipv6.c:tcp_v6_send_reset()
 +++ tcp_ipv6.c:tcp_v6_send_ack()
@@ -1,4 +1,5 @@
-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
u32 ts,
+                           struct tcp_md5sig_key *key)
 {
        struct tcphdr *th = tcp_hdr(skb), *t1;
        struct sk_buff *buff;
@@ -7,31 +8,14 @@
        struct sock *ctl_sk = net->ipv6.tcp_sk;
        unsigned int tot_len = sizeof(struct tcphdr);
        __be32 *topt;
-#ifdef CONFIG_TCP_MD5SIG
-       struct tcp_md5sig_key *key;
-#endif
-
-       if (th->rst)
-               return;
-
-       if (!ipv6_unicast_destination(skb))
-               return;

+       if (ts)
+               tot_len += TCPOLEN_TSTAMP_ALIGNED;
 #ifdef CONFIG_TCP_MD5SIG
-       if (sk)
-               key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr);
-       else
-               key = NULL;
-
        if (key)
                tot_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif

-       /*
-        * We need to grab some memory, and put together an RST,
-        * and then put it into the queue to be sent.
-        */
-
        buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
                         GFP_ATOMIC);
        if (buff == NULL)
@@ -46,18 +30,20 @@
        t1->dest = th->source;
        t1->source = th->dest;
        t1->doff = tot_len / 4;
-       t1->rst = 1;
-
-       if(th->ack) {
-               t1->seq = th->ack_seq;
-       } else {
-               t1->ack = 1;
-               t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
-                                   + skb->len - (th->doff<<2));
-       }
+       t1->seq = htonl(seq);
+       t1->ack_seq = htonl(ack);
+       t1->ack = 1;
+       t1->window = htons(win);

        topt = (__be32 *)(t1 + 1);

+       if (ts) {
+               *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+                               (TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
+               *topt++ = htonl(tcp_time_stamp);
+               *topt++ = htonl(ts);
+       }
+
 #ifdef CONFIG_TCP_MD5SIG
        if (key) {
                *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
@@ -84,15 +70,10 @@
        fl.fl_ip_sport = t1->source;
        security_skb_classify_flow(skb, &fl);

-       /* Pass a socket to ip6_dst_lookup either it is for RST
-        * Underlying function will use this to retrieve the network
-        * namespace
-        */
        if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) {
                if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
                        ip6_xmit(ctl_sk, buff, &fl, NULL, 0);
                        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
-                       TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
                        return;
                }
        }


...which starts to be trivial to combine.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 910603cd12d..1941c5c888d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -950,6 +950,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	struct net *net = dev_net(skb->dst->dev);
 	struct sock *ctl_sk = net->ipv6.tcp_sk;
 	unsigned int tot_len = sizeof(struct tcphdr);
+	__be32 *topt;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key *key;
 #endif
@@ -999,14 +1000,13 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 				    + skb->len - (th->doff<<2));
 	}
 
+	topt = (__be32 *)(t1 + 1);
+
 #ifdef CONFIG_TCP_MD5SIG
 	if (key) {
-		__be32 *opt = (__be32*)(t1 + 1);
-		opt[0] = htonl((TCPOPT_NOP << 24) |
-			       (TCPOPT_NOP << 16) |
-			       (TCPOPT_MD5SIG << 8) |
-			       TCPOLEN_MD5SIG);
-		tcp_v6_md5_hash_hdr((__u8 *)&opt[1], key,
+		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+		tcp_v6_md5_hash_hdr((__u8 *)topt, key,
 				    &ipv6_hdr(skb)->saddr,
 				    &ipv6_hdr(skb)->daddr, t1);
 	}
-- 
cgit v1.2.3


From 626e264dd1989bdc98a5eaf2e059af4dba07ac4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Thu, 9 Oct 2008 14:42:40 -0700
Subject: tcpv6: combine tcp_v6_send_(reset|ack)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

$ codiff tcp_ipv6.o.old tcp_ipv6.o.new
net/ipv6/tcp_ipv6.c:
  tcp_v6_md5_hash_hdr | -144
  tcp_v6_send_ack     | -585
  tcp_v6_send_reset   | -540
 3 functions changed, 1269 bytes removed, diff: -1269

net/ipv6/tcp_ipv6.c:
  tcp_v6_send_response | +791
 1 function changed, 791 bytes added, diff: +791

tcp_ipv6.o.new:
 4 functions changed, 791 bytes added, 1269 bytes removed, diff: -478

I choose to leave the reset related netns comment in place (not
the one that is killed) as I cannot understand its English so
it's a bit hard for me to evaluate its usefulness :-).

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 139 +++++++++++++++-------------------------------------
 1 file changed, 40 insertions(+), 99 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1941c5c888d..13c65144a00 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -942,7 +942,8 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb)
 	return 0;
 }
 
-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
+				 u32 ts, struct tcp_md5sig_key *key, int rst)
 {
 	struct tcphdr *th = tcp_hdr(skb), *t1;
 	struct sk_buff *buff;
@@ -951,31 +952,14 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	struct sock *ctl_sk = net->ipv6.tcp_sk;
 	unsigned int tot_len = sizeof(struct tcphdr);
 	__be32 *topt;
-#ifdef CONFIG_TCP_MD5SIG
-	struct tcp_md5sig_key *key;
-#endif
-
-	if (th->rst)
-		return;
-
-	if (!ipv6_unicast_destination(skb))
-		return;
 
+	if (ts)
+		tot_len += TCPOLEN_TSTAMP_ALIGNED;
 #ifdef CONFIG_TCP_MD5SIG
-	if (sk)
-		key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr);
-	else
-		key = NULL;
-
 	if (key)
 		tot_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 
-	/*
-	 * We need to grab some memory, and put together an RST,
-	 * and then put it into the queue to be sent.
-	 */
-
 	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
 			 GFP_ATOMIC);
 	if (buff == NULL)
@@ -990,18 +974,21 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	t1->dest = th->source;
 	t1->source = th->dest;
 	t1->doff = tot_len / 4;
-	t1->rst = 1;
-
-	if(th->ack) {
-		t1->seq = th->ack_seq;
-	} else {
-		t1->ack = 1;
-		t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
-				    + skb->len - (th->doff<<2));
-	}
+	t1->seq = htonl(seq);
+	t1->ack_seq = htonl(ack);
+	t1->ack = !rst || !th->ack;
+	t1->rst = rst;
+	t1->window = htons(win);
 
 	topt = (__be32 *)(t1 + 1);
 
+	if (ts) {
+		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+		*topt++ = htonl(tcp_time_stamp);
+		*topt++ = htonl(ts);
+	}
+
 #ifdef CONFIG_TCP_MD5SIG
 	if (key) {
 		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
@@ -1036,7 +1023,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 		if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
 			ip6_xmit(ctl_sk, buff, &fl, NULL, 0);
 			TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
-			TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+			if (rst)
+				TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 			return;
 		}
 	}
@@ -1044,87 +1032,40 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(buff);
 }
 
-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts,
-			    struct tcp_md5sig_key *key)
+static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcphdr *th = tcp_hdr(skb), *t1;
-	struct sk_buff *buff;
-	struct flowi fl;
-	struct net *net = dev_net(skb->dst->dev);
-	struct sock *ctl_sk = net->ipv6.tcp_sk;
-	unsigned int tot_len = sizeof(struct tcphdr);
-	__be32 *topt;
-
-	if (ts)
-		tot_len += TCPOLEN_TSTAMP_ALIGNED;
+	struct tcphdr *th = tcp_hdr(skb);
+	u32 seq = 0, ack_seq = 0;
 #ifdef CONFIG_TCP_MD5SIG
-	if (key)
-		tot_len += TCPOLEN_MD5SIG_ALIGNED;
+	struct tcp_md5sig_key *key;
 #endif
 
-	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
-			 GFP_ATOMIC);
-	if (buff == NULL)
+	if (th->rst)
 		return;
 
-	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
-
-	t1 = (struct tcphdr *) skb_push(buff, tot_len);
-
-	/* Swap the send and the receive. */
-	memset(t1, 0, sizeof(*t1));
-	t1->dest = th->source;
-	t1->source = th->dest;
-	t1->doff = tot_len / 4;
-	t1->seq = htonl(seq);
-	t1->ack_seq = htonl(ack);
-	t1->ack = 1;
-	t1->window = htons(win);
-
-	topt = (__be32 *)(t1 + 1);
-
-	if (ts) {
-		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
-				(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
-		*topt++ = htonl(tcp_time_stamp);
-		*topt++ = htonl(ts);
-	}
+	if (!ipv6_unicast_destination(skb))
+		return;
 
 #ifdef CONFIG_TCP_MD5SIG
-	if (key) {
-		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
-				(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
-		tcp_v6_md5_hash_hdr((__u8 *)topt, key,
-				    &ipv6_hdr(skb)->saddr,
-				    &ipv6_hdr(skb)->daddr, t1);
-	}
+	if (sk)
+		key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr);
+	else
+		key = NULL;
 #endif
 
-	buff->csum = csum_partial((char *)t1, tot_len, 0);
-
-	memset(&fl, 0, sizeof(fl));
-	ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr);
-	ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr);
-
-	t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
-				    tot_len, IPPROTO_TCP,
-				    buff->csum);
-
-	fl.proto = IPPROTO_TCP;
-	fl.oif = inet6_iif(skb);
-	fl.fl_ip_dport = t1->dest;
-	fl.fl_ip_sport = t1->source;
-	security_skb_classify_flow(skb, &fl);
+	if (th->ack)
+		seq = ntohl(th->ack_seq);
+	else
+		ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
+			  (th->doff << 2);
 
-	if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) {
-		if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
-			ip6_xmit(ctl_sk, buff, &fl, NULL, 0);
-			TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
-			return;
-		}
-	}
+	tcp_v6_send_response(skb, seq, ack_seq, 0, 0, key, 1);
+}
 
-	kfree_skb(buff);
+static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts,
+			    struct tcp_md5sig_key *key)
+{
+	tcp_v6_send_response(skb, seq, ack, win, ts, key, 0);
 }
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3


From f24d43c07e208372aa3d3bff419afbf43ba87698 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 9 Oct 2008 14:51:27 -0700
Subject: udp: complete port availability checking

While looking at UDP port randomization, I noticed it
was litle bit pessimistic, not looking at type of sockets
(IPV6/IPV4) and not looking at bound addresses if any.

We should perform same tests than when binding to a
specific port.

This permits a cleanup of udp_lib_get_port()

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 67d8430b4a2..eacf4cfef14 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,14 +122,23 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
-static inline int __udp_lib_lport_inuse(struct net *net, __u16 num,
-					const struct hlist_head udptable[])
+static int udp_lib_lport_inuse(struct net *net, __u16 num,
+			       const struct hlist_head udptable[],
+			       struct sock *sk,
+			       int (*saddr_comp)(const struct sock *sk1,
+						 const struct sock *sk2))
 {
-	struct sock *sk;
+	struct sock *sk2;
 	struct hlist_node *node;
 
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, num)])
-		if (net_eq(sock_net(sk), net) && sk->sk_hash == num)
+	sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)])
+		if (net_eq(sock_net(sk2), net)			&&
+		    sk2 != sk					&&
+		    sk2->sk_hash == num				&&
+		    (!sk2->sk_reuse || !sk->sk_reuse)		&&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
+			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    (*saddr_comp)(sk, sk2))
 			return 1;
 	return 0;
 }
@@ -146,9 +155,6 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 					 const struct sock *sk2 )    )
 {
 	struct hlist_head *udptable = sk->sk_prot->h.udp_hash;
-	struct hlist_node *node;
-	struct hlist_head *head;
-	struct sock *sk2;
 	int    error = 1;
 	struct net *net = sock_net(sk);
 
@@ -165,32 +171,21 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		rand = net_random();
 		snum = first = rand % remaining + low;
 		rand |= 1;
-		while (__udp_lib_lport_inuse(net, snum, udptable)) {
+		while (udp_lib_lport_inuse(net, snum, udptable, sk,
+					   saddr_comp)) {
 			do {
 				snum = snum + rand;
 			} while (snum < low || snum > high);
 			if (snum == first)
 				goto fail;
 		}
-	} else {
-		head = &udptable[udp_hashfn(net, snum)];
-
-		sk_for_each(sk2, node, head)
-			if (sk2->sk_hash == snum                             &&
-			    sk2 != sk                                        &&
-			    net_eq(sock_net(sk2), net)			     &&
-			    (!sk2->sk_reuse        || !sk->sk_reuse)         &&
-			    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
-			     || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-			    (*saddr_comp)(sk, sk2)                             )
-				goto fail;
-	}
+	} else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp))
+		goto fail;
 
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		head = &udptable[udp_hashfn(net, snum)];
-		sk_add_node(sk, head);
+		sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
-- 
cgit v1.2.3


From fa3e5b4eb8261ae6ee27922881093db973e9d640 Mon Sep 17 00:00:00 2001
From: Guo-Fu Tseng <cooldavid@cooldavid.org>
Date: Thu, 9 Oct 2008 21:11:56 -0700
Subject: tcpv6: fix error with CONFIG_TCP_MD5SIG disabled

This patch fix error with CONFIG_TCP_MD5SIG disabled.

Signed-off-by: Guo-Fu Tseng <cooldavid@cooldavid.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 13c65144a00..e5310c9b84d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1036,9 +1036,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcphdr *th = tcp_hdr(skb);
 	u32 seq = 0, ack_seq = 0;
-#ifdef CONFIG_TCP_MD5SIG
-	struct tcp_md5sig_key *key;
-#endif
+	struct tcp_md5sig_key *key = NULL;
 
 	if (th->rst)
 		return;
@@ -1049,8 +1047,6 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 #ifdef CONFIG_TCP_MD5SIG
 	if (sk)
 		key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr);
-	else
-		key = NULL;
 #endif
 
 	if (th->ack)
-- 
cgit v1.2.3


From 561967010edef40f539dacf2aa125e20773ab40b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:29 -0400
Subject: netlabel: Fix some sparse warnings

Fix a few sparse warnings.  One dealt with a RCU lock being held on error,
another dealt with an improper type caused by a signed/unsigned mixup while
the rest appeared to be caused by using rcu_dereference() in a
list_for_each_entry_rcu() call.  The latter probably isn't a big deal, but
I derive a certain pleasure from knowing that the net/netlabel is nice and
clean.

Thanks to James Morris for pointing out the issues and demonstrating how
to run sparse.

Signed-off-by: Paul Moore <paul.moore@hp.com>
---
 net/netlabel/netlabel_cipso_v4.c   |  4 ++--
 net/netlabel/netlabel_domainhash.c | 12 ++++++------
 net/netlabel/netlabel_unlabeled.c  | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 0aec318bf0e..aaf50032b3a 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -491,7 +491,7 @@ list_start:
 	doi_def = cipso_v4_doi_getdef(doi);
 	if (doi_def == NULL) {
 		ret_val = -EINVAL;
-		goto list_failure;
+		goto list_failure_lock;
 	}
 
 	ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type);
@@ -655,7 +655,7 @@ static int netlbl_cipsov4_listall(struct sk_buff *skb,
 				  struct netlink_callback *cb)
 {
 	struct netlbl_cipsov4_doiwalk_arg cb_arg;
-	int doi_skip = cb->args[0];
+	u32 doi_skip = cb->args[0];
 
 	cb_arg.nl_cb = cb;
 	cb_arg.skb = skb;
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index 643c032a3a5..dc42206c431 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -115,13 +115,13 @@ static u32 netlbl_domhsh_hash(const char *key)
 static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
 {
 	u32 bkt;
+	struct list_head *bkt_list;
 	struct netlbl_dom_map *iter;
 
 	if (domain != NULL) {
 		bkt = netlbl_domhsh_hash(domain);
-		list_for_each_entry_rcu(iter,
-				     &rcu_dereference(netlbl_domhsh)->tbl[bkt],
-				     list)
+		bkt_list = &rcu_dereference(netlbl_domhsh)->tbl[bkt];
+		list_for_each_entry_rcu(iter, bkt_list, list)
 			if (iter->valid && strcmp(iter->domain, domain) == 0)
 				return iter;
 	}
@@ -410,6 +410,7 @@ int netlbl_domhsh_walk(u32 *skip_bkt,
 {
 	int ret_val = -ENOENT;
 	u32 iter_bkt;
+	struct list_head *iter_list;
 	struct netlbl_dom_map *iter_entry;
 	u32 chain_cnt = 0;
 
@@ -417,9 +418,8 @@ int netlbl_domhsh_walk(u32 *skip_bkt,
 	for (iter_bkt = *skip_bkt;
 	     iter_bkt < rcu_dereference(netlbl_domhsh)->size;
 	     iter_bkt++, chain_cnt = 0) {
-		list_for_each_entry_rcu(iter_entry,
-				&rcu_dereference(netlbl_domhsh)->tbl[iter_bkt],
-				list)
+		iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
+		list_for_each_entry_rcu(iter_entry, iter_list, list)
 			if (iter_entry->valid) {
 				if (chain_cnt++ < *skip_chain)
 					continue;
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 921c118ead8..cc105a10e3f 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -381,12 +381,12 @@ static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
 static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
 {
 	u32 bkt;
+	struct list_head *bkt_list;
 	struct netlbl_unlhsh_iface *iter;
 
 	bkt = netlbl_unlhsh_hash(ifindex);
-	list_for_each_entry_rcu(iter,
-				&rcu_dereference(netlbl_unlhsh)->tbl[bkt],
-				list)
+	bkt_list = &rcu_dereference(netlbl_unlhsh)->tbl[bkt];
+	list_for_each_entry_rcu(iter, bkt_list, list)
 		if (iter->valid && iter->ifindex == ifindex)
 			return iter;
 
@@ -1427,6 +1427,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
 	struct netlbl_unlhsh_iface *iface;
 	struct netlbl_unlhsh_addr4 *addr4;
 	struct netlbl_unlhsh_addr6 *addr6;
+	struct list_head *iter_list;
 
 	cb_arg.nl_cb = cb;
 	cb_arg.skb = skb;
@@ -1436,9 +1437,8 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
 	for (iter_bkt = skip_bkt;
 	     iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
 	     iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
-		list_for_each_entry_rcu(iface,
-			        &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt],
-				list) {
+		iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
+		list_for_each_entry_rcu(iface, iter_list, list) {
 			if (!iface->valid ||
 			    iter_chain++ < skip_chain)
 				continue;
-- 
cgit v1.2.3


From 948a72438d4178d0728c4b0a38836d280b846939 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:30 -0400
Subject: netlabel: Remove unneeded in-kernel API functions

After some discussions with the Smack folks, well just Casey, I now have a
better idea of what Smack wants out of NetLabel in the future so I think it
is now safe to do some API "pruning".  If another LSM comes along that
needs this functionality we can always add it back in, but I don't see any
LSMs on the horizon which might make use of these functions.

Thanks to Rami Rosen who suggested removing netlbl_cfg_cipsov4_del() back
in February 2008.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/netlabel/netlabel_kapi.c | 84 ++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 61 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 39793a1a93a..6c211fe9778 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -82,7 +82,7 @@ int netlbl_cfg_unlbl_add_map(const char *domain,
 
 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL)
-		goto cfg_unlbl_add_map_failure;
+		return -ENOMEM;
 	if (domain != NULL) {
 		entry->domain = kstrdup(domain, GFP_ATOMIC);
 		if (entry->domain == NULL)
@@ -103,49 +103,6 @@ cfg_unlbl_add_map_failure:
 	return ret_val;
 }
 
-/**
- * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
- * @doi_def: the DOI definition
- * @audit_info: NetLabel audit information
- *
- * Description:
- * Add a new CIPSOv4 DOI definition to the NetLabel subsystem.  Returns zero on
- * success, negative values on failure.
- *
- */
-int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
-			   struct netlbl_audit *audit_info)
-{
-	int ret_val;
-	const char *type_str;
-	struct audit_buffer *audit_buf;
-
-	ret_val = cipso_v4_doi_add(doi_def);
-
-	audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
-					      audit_info);
-	if (audit_buf != NULL) {
-		switch (doi_def->type) {
-		case CIPSO_V4_MAP_STD:
-			type_str = "std";
-			break;
-		case CIPSO_V4_MAP_PASS:
-			type_str = "pass";
-			break;
-		default:
-			type_str = "(unknown)";
-		}
-		audit_log_format(audit_buf,
-				 " cipso_doi=%u cipso_type=%s res=%u",
-				 doi_def->doi,
-				 type_str,
-				 ret_val == 0 ? 1 : 0);
-		audit_log_end(audit_buf);
-	}
-
-	return ret_val;
-}
-
 /**
  * netlbl_cfg_cipsov4_add_map - Add a new CIPSOv4 DOI definition and mapping
  * @doi_def: the DOI definition
@@ -165,10 +122,12 @@ int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
 {
 	int ret_val = -ENOMEM;
 	struct netlbl_dom_map *entry;
+	const char *type_str;
+	struct audit_buffer *audit_buf;
 
 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL)
-		goto cfg_cipsov4_add_map_failure;
+		return -ENOMEM;
 	if (domain != NULL) {
 		entry->domain = kstrdup(domain, GFP_ATOMIC);
 		if (entry->domain == NULL)
@@ -182,7 +141,7 @@ int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
 	 * domain mapping for it. */
 
 	rcu_read_lock();
-	ret_val = netlbl_cfg_cipsov4_add(doi_def, audit_info);
+	ret_val = cipso_v4_doi_add(doi_def);
 	if (ret_val != 0)
 		goto cfg_cipsov4_add_map_failure_unlock;
 	ret_val = netlbl_domhsh_add(entry, audit_info);
@@ -196,6 +155,24 @@ cfg_cipsov4_add_map_failure_remove_doi:
 	cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free);
 cfg_cipsov4_add_map_failure_unlock:
 	rcu_read_unlock();
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
+					      audit_info);
+	if (audit_buf != NULL) {
+		switch (doi_def->type) {
+		case CIPSO_V4_MAP_STD:
+			type_str = "std";
+			break;
+		case CIPSO_V4_MAP_PASS:
+			type_str = "pass";
+			break;
+		default:
+			type_str = "(unknown)";
+		}
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u cipso_type=%s res=%u",
+				 doi_def->doi, type_str, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
 cfg_cipsov4_add_map_failure:
 	if (entry != NULL)
 		kfree(entry->domain);
@@ -203,21 +180,6 @@ cfg_cipsov4_add_map_failure:
 	return ret_val;
 }
 
-/**
- * netlbl_cfg_cipsov4_del - Removean existing CIPSOv4 DOI definition
- * @doi: the CIPSO DOI value
- * @audit_info: NetLabel audit information
- *
- * Description:
- * Removes an existing CIPSOv4 DOI definition from the NetLabel subsystem.
- * Returns zero on success, negative values on failure.
- *
- */
-int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
-{
-	return cipso_v4_doi_remove(doi, audit_info, netlbl_cipsov4_doi_free);
-}
-
 /*
  * Security Attribute Functions
  */
-- 
cgit v1.2.3


From dfaebe9825ff34983778f287101bc5f3bce00640 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:31 -0400
Subject: selinux: Fix missing calls to netlbl_skbuff_err()

At some point I think I messed up and dropped the calls to netlbl_skbuff_err()
which are necessary for CIPSO to send error notifications to remote systems.
This patch re-introduces the error handling calls into the SELinux code.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Acked-by: James Morris <jmorris@namei.org>
---
 net/netlabel/netlabel_kapi.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 6c211fe9778..22faba620e4 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -490,6 +490,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
  * netlbl_skbuff_err - Handle a LSM error on a sk_buff
  * @skb: the packet
  * @error: the error code
+ * @gateway: true if host is acting as a gateway, false otherwise
  *
  * Description:
  * Deal with a LSM problem when handling the packet in @skb, typically this is
@@ -497,10 +498,10 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
  * according to the packet's labeling protocol.
  *
  */
-void netlbl_skbuff_err(struct sk_buff *skb, int error)
+void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
 {
 	if (CIPSO_V4_OPTEXIST(skb))
-		cipso_v4_error(skb, error, 0);
+		cipso_v4_error(skb, error, gateway);
 }
 
 /**
-- 
cgit v1.2.3


From b1edeb102397546438ab4624489c6ccd7b410d97 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:31 -0400
Subject: netlabel: Replace protocol/NetLabel linking with refrerence counts

NetLabel has always had a list of backpointers in the CIPSO DOI definition
structure which pointed to the NetLabel LSM domain mapping structures which
referenced the CIPSO DOI struct.  The rationale for this was that when an
administrator removed a CIPSO DOI from the system all of the associated
NetLabel LSM domain mappings should be removed as well; a list of
backpointers made this a simple operation.

Unfortunately, while the backpointers did make the removal easier they were
a bit of a mess from an implementation point of view which was making
further development difficult.  Since the removal of a CIPSO DOI is a
realtively rare event it seems to make sense to remove this backpointer
list as the optimization was hurting us more then it was helping.  However,
we still need to be able to track when a CIPSO DOI definition is being used
so replace the backpointer list with a reference count.  In order to
preserve the current functionality of removing the associated LSM domain
mappings when a CIPSO DOI is removed we walk the LSM domain mapping table,
removing the relevant entries.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/ipv4/cipso_ipv4.c              | 235 ++++++++++++++++---------------------
 net/netlabel/netlabel_cipso_v4.c   |  77 ++++++------
 net/netlabel/netlabel_domainhash.c |  95 ++++++++-------
 net/netlabel/netlabel_domainhash.h |   2 +
 net/netlabel/netlabel_kapi.c       |  43 ++++---
 net/netlabel/netlabel_mgmt.c       |  24 +---
 6 files changed, 221 insertions(+), 255 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 2c0e4572cc9..bf87eddfec3 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -47,17 +47,7 @@
 #include <asm/bug.h>
 #include <asm/unaligned.h>
 
-struct cipso_v4_domhsh_entry {
-	char *domain;
-	u32 valid;
-	struct list_head list;
-	struct rcu_head rcu;
-};
-
 /* List of available DOI definitions */
-/* XXX - Updates should be minimal so having a single lock for the
- * cipso_v4_doi_list and the cipso_v4_doi_list->dom_list should be
- * okay. */
 /* XXX - This currently assumes a minimal number of different DOIs in use,
  * if in practice there are a lot of different DOIs this list should
  * probably be turned into a hash table or something similar so we
@@ -193,25 +183,6 @@ static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
 		bitmap[byte_spot] &= ~bitmask;
 }
 
-/**
- * cipso_v4_doi_domhsh_free - Frees a domain list entry
- * @entry: the entry's RCU field
- *
- * Description:
- * This function is designed to be used as a callback to the call_rcu()
- * function so that the memory allocated to a domain list entry can be released
- * safely.
- *
- */
-static void cipso_v4_doi_domhsh_free(struct rcu_head *entry)
-{
-	struct cipso_v4_domhsh_entry *ptr;
-
-	ptr = container_of(entry, struct cipso_v4_domhsh_entry, rcu);
-	kfree(ptr->domain);
-	kfree(ptr);
-}
-
 /**
  * cipso_v4_cache_entry_free - Frees a cache entry
  * @entry: the entry to free
@@ -457,7 +428,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
 	struct cipso_v4_doi *iter;
 
 	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
-		if (iter->doi == doi && iter->valid)
+		if (iter->doi == doi && atomic_read(&iter->refcount))
 			return iter;
 	return NULL;
 }
@@ -501,9 +472,8 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
 		}
 	}
 
-	doi_def->valid = 1;
+	atomic_set(&doi_def->refcount, 1);
 	INIT_RCU_HEAD(&doi_def->rcu);
-	INIT_LIST_HEAD(&doi_def->dom_list);
 
 	spin_lock(&cipso_v4_doi_list_lock);
 	if (cipso_v4_doi_search(doi_def->doi) != NULL)
@@ -518,60 +488,130 @@ doi_add_failure:
 	return -EEXIST;
 }
 
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL)
+		return;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_STD:
+		kfree(doi_def->map.std->lvl.cipso);
+		kfree(doi_def->map.std->lvl.local);
+		kfree(doi_def->map.std->cat.cipso);
+		kfree(doi_def->map.std->cat.local);
+		break;
+	}
+	kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+	struct cipso_v4_doi *doi_def;
+
+	doi_def = container_of(entry, struct cipso_v4_doi, rcu);
+	cipso_v4_doi_free(doi_def);
+}
+
 /**
  * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
  * @doi: the DOI value
  * @audit_secid: the LSM secid to use in the audit message
- * @callback: the DOI cleanup/free callback
  *
  * Description:
- * Removes a DOI definition from the CIPSO engine, @callback is called to
- * free any memory.  The NetLabel routines will be called to release their own
- * LSM domain mappings as well as our own domain list.  Returns zero on
- * success and negative values on failure.
+ * Removes a DOI definition from the CIPSO engine.  The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list.  Returns zero on success and negative values on failure.
  *
  */
-int cipso_v4_doi_remove(u32 doi,
-			struct netlbl_audit *audit_info,
-			void (*callback) (struct rcu_head * head))
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
 {
 	struct cipso_v4_doi *doi_def;
-	struct cipso_v4_domhsh_entry *dom_iter;
 
 	spin_lock(&cipso_v4_doi_list_lock);
 	doi_def = cipso_v4_doi_search(doi);
-	if (doi_def != NULL) {
-		doi_def->valid = 0;
-		list_del_rcu(&doi_def->list);
+	if (doi_def == NULL) {
 		spin_unlock(&cipso_v4_doi_list_lock);
-		rcu_read_lock();
-		list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
-			if (dom_iter->valid)
-				netlbl_cfg_map_del(dom_iter->domain,
-						   audit_info);
-		rcu_read_unlock();
-		cipso_v4_cache_invalidate();
-		call_rcu(&doi_def->rcu, callback);
-		return 0;
+		return -ENOENT;
+	}
+	if (!atomic_dec_and_test(&doi_def->refcount)) {
+		spin_unlock(&cipso_v4_doi_list_lock);
+		return -EBUSY;
 	}
+	list_del_rcu(&doi_def->list);
 	spin_unlock(&cipso_v4_doi_list_lock);
 
-	return -ENOENT;
+	cipso_v4_cache_invalidate();
+	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+
+	return 0;
 }
 
 /**
- * cipso_v4_doi_getdef - Returns a pointer to a valid DOI definition
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
  * @doi: the DOI value
  *
  * Description:
  * Searches for a valid DOI definition and if one is found it is returned to
  * the caller.  Otherwise NULL is returned.  The caller must ensure that
- * rcu_read_lock() is held while accessing the returned definition.
+ * rcu_read_lock() is held while accessing the returned definition and the DOI
+ * definition reference count is decremented when the caller is done.
  *
  */
 struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
 {
-	return cipso_v4_doi_search(doi);
+	struct cipso_v4_doi *doi_def;
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(doi);
+	if (doi_def == NULL)
+		goto doi_getdef_return;
+	if (!atomic_inc_not_zero(&doi_def->refcount))
+		doi_def = NULL;
+
+doi_getdef_return:
+	rcu_read_unlock();
+	return doi_def;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL)
+		return;
+
+	if (!atomic_dec_and_test(&doi_def->refcount))
+		return;
+	spin_lock(&cipso_v4_doi_list_lock);
+	list_del_rcu(&doi_def->list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+
+	cipso_v4_cache_invalidate();
+	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
 }
 
 /**
@@ -597,7 +637,7 @@ int cipso_v4_doi_walk(u32 *skip_cnt,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
-		if (iter_doi->valid) {
+		if (atomic_read(&iter_doi->refcount) > 0) {
 			if (doi_cnt++ < *skip_cnt)
 				continue;
 			ret_val = callback(iter_doi, cb_arg);
@@ -613,85 +653,6 @@ doi_walk_return:
 	return ret_val;
 }
 
-/**
- * cipso_v4_doi_domhsh_add - Adds a domain entry to a DOI definition
- * @doi_def: the DOI definition
- * @domain: the domain to add
- *
- * Description:
- * Adds the @domain to the DOI specified by @doi_def, this function
- * should only be called by external functions (i.e. NetLabel).  This function
- * does allocate memory.  Returns zero on success, negative values on failure.
- *
- */
-int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain)
-{
-	struct cipso_v4_domhsh_entry *iter;
-	struct cipso_v4_domhsh_entry *new_dom;
-
-	new_dom = kzalloc(sizeof(*new_dom), GFP_KERNEL);
-	if (new_dom == NULL)
-		return -ENOMEM;
-	if (domain) {
-		new_dom->domain = kstrdup(domain, GFP_KERNEL);
-		if (new_dom->domain == NULL) {
-			kfree(new_dom);
-			return -ENOMEM;
-		}
-	}
-	new_dom->valid = 1;
-	INIT_RCU_HEAD(&new_dom->rcu);
-
-	spin_lock(&cipso_v4_doi_list_lock);
-	list_for_each_entry(iter, &doi_def->dom_list, list)
-		if (iter->valid &&
-		    ((domain != NULL && iter->domain != NULL &&
-		      strcmp(iter->domain, domain) == 0) ||
-		     (domain == NULL && iter->domain == NULL))) {
-			spin_unlock(&cipso_v4_doi_list_lock);
-			kfree(new_dom->domain);
-			kfree(new_dom);
-			return -EEXIST;
-		}
-	list_add_tail_rcu(&new_dom->list, &doi_def->dom_list);
-	spin_unlock(&cipso_v4_doi_list_lock);
-
-	return 0;
-}
-
-/**
- * cipso_v4_doi_domhsh_remove - Removes a domain entry from a DOI definition
- * @doi_def: the DOI definition
- * @domain: the domain to remove
- *
- * Description:
- * Removes the @domain from the DOI specified by @doi_def, this function
- * should only be called by external functions (i.e. NetLabel).   Returns zero
- * on success and negative values on error.
- *
- */
-int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
-			       const char *domain)
-{
-	struct cipso_v4_domhsh_entry *iter;
-
-	spin_lock(&cipso_v4_doi_list_lock);
-	list_for_each_entry(iter, &doi_def->dom_list, list)
-		if (iter->valid &&
-		    ((domain != NULL && iter->domain != NULL &&
-		      strcmp(iter->domain, domain) == 0) ||
-		     (domain == NULL && iter->domain == NULL))) {
-			iter->valid = 0;
-			list_del_rcu(&iter->list);
-			spin_unlock(&cipso_v4_doi_list_lock);
-			call_rcu(&iter->rcu, cipso_v4_doi_domhsh_free);
-			return 0;
-		}
-	spin_unlock(&cipso_v4_doi_list_lock);
-
-	return -ENOENT;
-}
-
 /*
  * Label Mapping Functions
  */
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index aaf50032b3a..5c4f60bbc82 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -43,6 +43,7 @@
 #include "netlabel_user.h"
 #include "netlabel_cipso_v4.h"
 #include "netlabel_mgmt.h"
+#include "netlabel_domainhash.h"
 
 /* Argument struct for cipso_v4_doi_walk() */
 struct netlbl_cipsov4_doiwalk_arg {
@@ -51,6 +52,12 @@ struct netlbl_cipsov4_doiwalk_arg {
 	u32 seq;
 };
 
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+	struct netlbl_audit *audit_info;
+	u32 doi;
+};
+
 /* NetLabel Generic NETLINK CIPSOv4 family */
 static struct genl_family netlbl_cipsov4_gnl_family = {
 	.id = GENL_ID_GENERATE,
@@ -80,32 +87,6 @@ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1
  * Helper Functions
  */
 
-/**
- * netlbl_cipsov4_doi_free - Frees a CIPSO V4 DOI definition
- * @entry: the entry's RCU field
- *
- * Description:
- * This function is designed to be used as a callback to the call_rcu()
- * function so that the memory allocated to the DOI definition can be released
- * safely.
- *
- */
-void netlbl_cipsov4_doi_free(struct rcu_head *entry)
-{
-	struct cipso_v4_doi *ptr;
-
-	ptr = container_of(entry, struct cipso_v4_doi, rcu);
-	switch (ptr->type) {
-	case CIPSO_V4_MAP_STD:
-		kfree(ptr->map.std->lvl.cipso);
-		kfree(ptr->map.std->lvl.local);
-		kfree(ptr->map.std->cat.cipso);
-		kfree(ptr->map.std->cat.local);
-		break;
-	}
-	kfree(ptr);
-}
-
 /**
  * netlbl_cipsov4_add_common - Parse the common sections of a ADD message
  * @info: the Generic NETLINK info block
@@ -342,7 +323,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info)
 
 add_std_failure:
 	if (doi_def)
-		netlbl_cipsov4_doi_free(&doi_def->rcu);
+		cipso_v4_doi_free(doi_def);
 	return ret_val;
 }
 
@@ -379,7 +360,7 @@ static int netlbl_cipsov4_add_pass(struct genl_info *info)
 	return 0;
 
 add_pass_failure:
-	netlbl_cipsov4_doi_free(&doi_def->rcu);
+	cipso_v4_doi_free(doi_def);
 	return ret_val;
 }
 
@@ -667,6 +648,29 @@ static int netlbl_cipsov4_listall(struct sk_buff *skb,
 	return skb->len;
 }
 
+/**
+ * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE
+ * @entry: LSM domain mapping entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is intended for use by netlbl_cipsov4_remove() as the callback
+ * for the netlbl_domhsh_walk() function; it removes LSM domain map entries
+ * which are associated with the CIPSO DOI specified in @arg.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg)
+{
+	struct netlbl_domhsh_walk_arg *cb_arg = arg;
+
+	if (entry->type == NETLBL_NLTYPE_CIPSOV4 &&
+	    entry->type_def.cipsov4->doi == cb_arg->doi)
+		return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info);
+
+	return 0;
+}
+
 /**
  * netlbl_cipsov4_remove - Handle a REMOVE message
  * @skb: the NETLINK buffer
@@ -681,8 +685,11 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
 {
 	int ret_val = -EINVAL;
 	u32 doi = 0;
+	struct netlbl_domhsh_walk_arg cb_arg;
 	struct audit_buffer *audit_buf;
 	struct netlbl_audit audit_info;
+	u32 skip_bkt = 0;
+	u32 skip_chain = 0;
 
 	if (!info->attrs[NLBL_CIPSOV4_A_DOI])
 		return -EINVAL;
@@ -690,11 +697,15 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
 	doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
 	netlbl_netlink_auditinfo(skb, &audit_info);
 
-	ret_val = cipso_v4_doi_remove(doi,
-				      &audit_info,
-				      netlbl_cipsov4_doi_free);
-	if (ret_val == 0)
-		atomic_dec(&netlabel_mgmt_protocount);
+	cb_arg.doi = doi;
+	cb_arg.audit_info = &audit_info;
+	ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain,
+				     netlbl_cipsov4_remove_cb, &cb_arg);
+	if (ret_val == 0 || ret_val == -ENOENT) {
+		ret_val = cipso_v4_doi_remove(doi, &audit_info);
+		if (ret_val == 0)
+			atomic_dec(&netlabel_mgmt_protocount);
+	}
 
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL,
 					      &audit_info);
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index dc42206c431..0243f0c57b4 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -217,20 +217,6 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
 	u32 bkt;
 	struct audit_buffer *audit_buf;
 
-	switch (entry->type) {
-	case NETLBL_NLTYPE_UNLABELED:
-		ret_val = 0;
-		break;
-	case NETLBL_NLTYPE_CIPSOV4:
-		ret_val = cipso_v4_doi_domhsh_add(entry->type_def.cipsov4,
-						  entry->domain);
-		break;
-	default:
-		return -EINVAL;
-	}
-	if (ret_val != 0)
-		return ret_val;
-
 	entry->valid = 1;
 	INIT_RCU_HEAD(&entry->rcu);
 
@@ -271,16 +257,6 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
 	}
 	rcu_read_unlock();
 
-	if (ret_val != 0) {
-		switch (entry->type) {
-		case NETLBL_NLTYPE_CIPSOV4:
-			if (cipso_v4_doi_domhsh_remove(entry->type_def.cipsov4,
-						       entry->domain) != 0)
-				BUG();
-			break;
-		}
-	}
-
 	return ret_val;
 }
 
@@ -302,35 +278,26 @@ int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
 }
 
 /**
- * netlbl_domhsh_remove - Removes an entry from the domain hash table
- * @domain: the domain to remove
+ * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
+ * @entry: the entry to remove
  * @audit_info: NetLabel audit information
  *
  * Description:
  * Removes an entry from the domain hash table and handles any updates to the
- * lower level protocol handler (i.e. CIPSO).  Returns zero on success,
- * negative on failure.
+ * lower level protocol handler (i.e. CIPSO).  Caller is responsible for
+ * ensuring that the RCU read lock is held.  Returns zero on success, negative
+ * on failure.
  *
  */
-int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
+int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
+			       struct netlbl_audit *audit_info)
 {
-	int ret_val = -ENOENT;
-	struct netlbl_dom_map *entry;
+	int ret_val = 0;
 	struct audit_buffer *audit_buf;
 
-	rcu_read_lock();
-	if (domain)
-		entry = netlbl_domhsh_search(domain);
-	else
-		entry = netlbl_domhsh_search_def(domain);
 	if (entry == NULL)
-		goto remove_return;
-	switch (entry->type) {
-	case NETLBL_NLTYPE_CIPSOV4:
-		cipso_v4_doi_domhsh_remove(entry->type_def.cipsov4,
-					   entry->domain);
-		break;
-	}
+		return -ENOENT;
+
 	spin_lock(&netlbl_domhsh_lock);
 	if (entry->valid) {
 		entry->valid = 0;
@@ -338,8 +305,8 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
 			list_del_rcu(&entry->list);
 		else
 			rcu_assign_pointer(netlbl_domhsh_def, NULL);
-		ret_val = 0;
-	}
+	} else
+		ret_val = -ENOENT;
 	spin_unlock(&netlbl_domhsh_lock);
 
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
@@ -351,10 +318,42 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
 		audit_log_end(audit_buf);
 	}
 
-remove_return:
-	rcu_read_unlock();
-	if (ret_val == 0)
+	if (ret_val == 0) {
+		switch (entry->type) {
+		case NETLBL_NLTYPE_CIPSOV4:
+			cipso_v4_doi_putdef(entry->type_def.cipsov4);
+			break;
+		}
 		call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
+	}
+
+	return ret_val;
+}
+
+/**
+ * netlbl_domhsh_remove - Removes an entry from the domain hash table
+ * @domain: the domain to remove
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an entry from the domain hash table and handles any updates to the
+ * lower level protocol handler (i.e. CIPSO).  Returns zero on success,
+ * negative on failure.
+ *
+ */
+int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct netlbl_dom_map *entry;
+
+	rcu_read_lock();
+	if (domain)
+		entry = netlbl_domhsh_search(domain);
+	else
+		entry = netlbl_domhsh_search_def(domain);
+	ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
+	rcu_read_unlock();
+
 	return ret_val;
 }
 
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 8220990ceb9..afcc41a7432 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -61,6 +61,8 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
 		      struct netlbl_audit *audit_info);
 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
 			      struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
+			       struct netlbl_audit *audit_info);
 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 22faba620e4..7d8ecea9391 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -121,10 +121,15 @@ int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
 			       struct netlbl_audit *audit_info)
 {
 	int ret_val = -ENOMEM;
+	u32 doi;
+	u32 doi_type;
 	struct netlbl_dom_map *entry;
 	const char *type_str;
 	struct audit_buffer *audit_buf;
 
+	doi = doi_def->doi;
+	doi_type = doi_def->type;
+
 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL)
 		return -ENOMEM;
@@ -133,32 +138,25 @@ int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
 		if (entry->domain == NULL)
 			goto cfg_cipsov4_add_map_failure;
 	}
-	entry->type = NETLBL_NLTYPE_CIPSOV4;
-	entry->type_def.cipsov4 = doi_def;
-
-	/* Grab a RCU read lock here so nothing happens to the doi_def variable
-	 * between adding it to the CIPSOv4 protocol engine and adding a
-	 * domain mapping for it. */
 
-	rcu_read_lock();
 	ret_val = cipso_v4_doi_add(doi_def);
 	if (ret_val != 0)
-		goto cfg_cipsov4_add_map_failure_unlock;
+		goto cfg_cipsov4_add_map_failure_remove_doi;
+	entry->type = NETLBL_NLTYPE_CIPSOV4;
+	entry->type_def.cipsov4 = cipso_v4_doi_getdef(doi);
+	if (entry->type_def.cipsov4 == NULL) {
+		ret_val = -ENOENT;
+		goto cfg_cipsov4_add_map_failure_remove_doi;
+	}
 	ret_val = netlbl_domhsh_add(entry, audit_info);
 	if (ret_val != 0)
-		goto cfg_cipsov4_add_map_failure_remove_doi;
-	rcu_read_unlock();
-
-	return 0;
+		goto cfg_cipsov4_add_map_failure_release_doi;
 
-cfg_cipsov4_add_map_failure_remove_doi:
-	cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free);
-cfg_cipsov4_add_map_failure_unlock:
-	rcu_read_unlock();
+cfg_cipsov4_add_map_return:
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
 					      audit_info);
 	if (audit_buf != NULL) {
-		switch (doi_def->type) {
+		switch (doi_type) {
 		case CIPSO_V4_MAP_STD:
 			type_str = "std";
 			break;
@@ -170,14 +168,21 @@ cfg_cipsov4_add_map_failure_unlock:
 		}
 		audit_log_format(audit_buf,
 				 " cipso_doi=%u cipso_type=%s res=%u",
-				 doi_def->doi, type_str, ret_val == 0 ? 1 : 0);
+				 doi, type_str, ret_val == 0 ? 1 : 0);
 		audit_log_end(audit_buf);
 	}
+
+	return ret_val;
+
+cfg_cipsov4_add_map_failure_release_doi:
+	cipso_v4_doi_putdef(doi_def);
+cfg_cipsov4_add_map_failure_remove_doi:
+	cipso_v4_doi_remove(doi, audit_info);
 cfg_cipsov4_add_map_failure:
 	if (entry != NULL)
 		kfree(entry->domain);
 	kfree(entry);
-	return ret_val;
+	goto cfg_cipsov4_add_map_return;
 }
 
 /*
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 44be5d5261f..c4e18c7bc0c 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -122,18 +122,12 @@ static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info)
 			goto add_failure;
 
 		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
-		/* We should be holding a rcu_read_lock() here while we hold
-		 * the result but since the entry will always be deleted when
-		 * the CIPSO DOI is deleted we aren't going to keep the
-		 * lock. */
-		rcu_read_lock();
 		entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
-		if (entry->type_def.cipsov4 == NULL) {
-			rcu_read_unlock();
+		if (entry->type_def.cipsov4 == NULL)
 			goto add_failure;
-		}
 		ret_val = netlbl_domhsh_add(entry, &audit_info);
-		rcu_read_unlock();
+		if (ret_val != 0)
+			cipso_v4_doi_putdef(entry->type_def.cipsov4);
 		break;
 	default:
 		goto add_failure;
@@ -294,18 +288,12 @@ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info)
 			goto adddef_failure;
 
 		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
-		/* We should be holding a rcu_read_lock() here while we hold
-		 * the result but since the entry will always be deleted when
-		 * the CIPSO DOI is deleted we aren't going to keep the
-		 * lock. */
-		rcu_read_lock();
 		entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
-		if (entry->type_def.cipsov4 == NULL) {
-			rcu_read_unlock();
+		if (entry->type_def.cipsov4 == NULL)
 			goto adddef_failure;
-		}
 		ret_val = netlbl_domhsh_add_default(entry, &audit_info);
-		rcu_read_unlock();
+		if (ret_val != 0)
+			cipso_v4_doi_putdef(entry->type_def.cipsov4);
 		break;
 	default:
 		goto adddef_failure;
-- 
cgit v1.2.3


From 61e1068219950c672ce979719ad2be3aadb00d7d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:32 -0400
Subject: netlabel: Add a generic way to create ordered linked lists of network
 addrs

Create an ordered IP address linked list mechanism similar to the core
kernel's linked list construct.  The idea behind this list functionality
is to create an extensibile linked list ordered by IP address mask to
ease the matching of network addresses.  The linked list is ordered with
larger address masks at the front of the list and shorter address masks
at the end to facilitate overriding network entries with individual host
or subnet entries.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/netlabel/Makefile             |   3 +-
 net/netlabel/netlabel_addrlist.c  | 258 ++++++++++++++++++++++++++++
 net/netlabel/netlabel_addrlist.h  | 174 +++++++++++++++++++
 net/netlabel/netlabel_unlabeled.c | 350 +++++++++++++++-----------------------
 4 files changed, 569 insertions(+), 216 deletions(-)
 create mode 100644 net/netlabel/netlabel_addrlist.c
 create mode 100644 net/netlabel/netlabel_addrlist.h

(limited to 'net')

diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile
index 8af18c0a47d..ea750e9df65 100644
--- a/net/netlabel/Makefile
+++ b/net/netlabel/Makefile
@@ -5,7 +5,8 @@
 #
 
 # base objects
-obj-y	:= netlabel_user.o netlabel_kapi.o netlabel_domainhash.o
+obj-y	:= netlabel_user.o netlabel_kapi.o
+obj-y	+= netlabel_domainhash.o netlabel_addrlist.o
 
 # management objects
 obj-y	+= netlabel_mgmt.o
diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c
new file mode 100644
index 00000000000..dd928aa52db
--- /dev/null
+++ b/net/netlabel/netlabel_addrlist.c
@@ -0,0 +1,258 @@
+/*
+ * NetLabel Network Address Lists
+ *
+ * This file contains network address list functions used to manage ordered
+ * lists of network addresses for use by the NetLabel subsystem.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include "netlabel_addrlist.h"
+
+/*
+ * Address List Functions
+ */
+
+/**
+ * netlbl_af4list_search - Search for a matching IPv4 address entry
+ * @addr: IPv4 address
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv4 address list given by @head.  If a matching address entry
+ * is found it is returned, otherwise NULL is returned.  The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
+					     struct list_head *head)
+{
+	struct netlbl_af4list *iter;
+
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid && (addr & iter->mask) == iter->addr)
+			return iter;
+
+	return NULL;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_search - Search for a matching IPv6 address entry
+ * @addr: IPv6 address
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv6 address list given by @head.  If a matching address entry
+ * is found it is returned, otherwise NULL is returned.  The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
+					     struct list_head *head)
+{
+	struct netlbl_af6list *iter;
+
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid &&
+		    ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
+			return iter;
+
+	return NULL;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_af4list_add - Add a new IPv4 address entry to a list
+ * @entry: address entry
+ * @head: the list head
+ *
+ * Description:
+ * Add a new address entry to the list pointed to by @head.  On success zero is
+ * returned, otherwise a negative value is returned.  The caller is responsible
+ * for calling the necessary locking functions.
+ *
+ */
+int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head)
+{
+	struct netlbl_af4list *iter;
+
+	iter = netlbl_af4list_search(entry->addr, head);
+	if (iter != NULL &&
+	    iter->addr == entry->addr && iter->mask == entry->mask)
+		return -EEXIST;
+
+	/* in order to speed up address searches through the list (the common
+	 * case) we need to keep the list in order based on the size of the
+	 * address mask such that the entry with the widest mask (smallest
+	 * numerical value) appears first in the list */
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid &&
+		    ntohl(entry->mask) > ntohl(iter->mask)) {
+			__list_add_rcu(&entry->list,
+				       iter->list.prev,
+				       &iter->list);
+			return 0;
+		}
+	list_add_tail_rcu(&entry->list, head);
+	return 0;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_add - Add a new IPv6 address entry to a list
+ * @entry: address entry
+ * @head: the list head
+ *
+ * Description:
+ * Add a new address entry to the list pointed to by @head.  On success zero is
+ * returned, otherwise a negative value is returned.  The caller is responsible
+ * for calling the necessary locking functions.
+ *
+ */
+int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head)
+{
+	struct netlbl_af6list *iter;
+
+	iter = netlbl_af6list_search(&entry->addr, head);
+	if (iter != NULL &&
+	    ipv6_addr_equal(&iter->addr, &entry->addr) &&
+	    ipv6_addr_equal(&iter->mask, &entry->mask))
+		return -EEXIST;
+
+	/* in order to speed up address searches through the list (the common
+	 * case) we need to keep the list in order based on the size of the
+	 * address mask such that the entry with the widest mask (smallest
+	 * numerical value) appears first in the list */
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid &&
+		    ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
+			__list_add_rcu(&entry->list,
+				       iter->list.prev,
+				       &iter->list);
+			return 0;
+		}
+	list_add_tail_rcu(&entry->list, head);
+	return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_af4list_remove_entry - Remove an IPv4 address entry
+ * @entry: address entry
+ *
+ * Description:
+ * Remove the specified IP address entry.  The caller is responsible for
+ * calling the necessary locking functions.
+ *
+ */
+void netlbl_af4list_remove_entry(struct netlbl_af4list *entry)
+{
+	entry->valid = 0;
+	list_del_rcu(&entry->list);
+}
+
+/**
+ * netlbl_af4list_remove - Remove an IPv4 address entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @head: the list head
+ *
+ * Description:
+ * Remove an IP address entry from the list pointed to by @head.  Returns the
+ * entry on success, NULL on failure.  The caller is responsible for calling
+ * the necessary locking functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
+					     struct list_head *head)
+{
+	struct netlbl_af4list *entry;
+
+	entry = netlbl_af4list_search(addr, head);
+	if (entry != NULL && entry->addr == addr && entry->mask == mask) {
+		netlbl_af4list_remove_entry(entry);
+		return entry;
+	}
+
+	return NULL;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_remove_entry - Remove an IPv6 address entry
+ * @entry: address entry
+ *
+ * Description:
+ * Remove the specified IP address entry.  The caller is responsible for
+ * calling the necessary locking functions.
+ *
+ */
+void netlbl_af6list_remove_entry(struct netlbl_af6list *entry)
+{
+	entry->valid = 0;
+	list_del_rcu(&entry->list);
+}
+
+/**
+ * netlbl_af6list_remove - Remove an IPv6 address entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @head: the list head
+ *
+ * Description:
+ * Remove an IP address entry from the list pointed to by @head.  Returns the
+ * entry on success, NULL on failure.  The caller is responsible for calling
+ * the necessary locking functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
+					     const struct in6_addr *mask,
+					     struct list_head *head)
+{
+	struct netlbl_af6list *entry;
+
+	entry = netlbl_af6list_search(addr, head);
+	if (entry != NULL &&
+	    ipv6_addr_equal(&entry->addr, addr) &&
+	    ipv6_addr_equal(&entry->mask, mask)) {
+		netlbl_af6list_remove_entry(entry);
+		return entry;
+	}
+
+	return NULL;
+}
+#endif /* IPv6 */
diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h
new file mode 100644
index 00000000000..0c41df057fa
--- /dev/null
+++ b/net/netlabel/netlabel_addrlist.h
@@ -0,0 +1,174 @@
+/*
+ * NetLabel Network Address Lists
+ *
+ * This file contains network address list functions used to manage ordered
+ * lists of network addresses for use by the NetLabel subsystem.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_ADDRLIST_H
+#define _NETLABEL_ADDRLIST_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/in6.h>
+
+/**
+ * struct netlbl_af4list - NetLabel IPv4 address list
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @valid: valid flag
+ * @list: list structure, used internally
+ */
+struct netlbl_af4list {
+	__be32 addr;
+	__be32 mask;
+
+	u32 valid;
+	struct list_head list;
+};
+
+/**
+ * struct netlbl_af6list - NetLabel IPv6 address list
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @valid: valid flag
+ * @list: list structure, used internally
+ */
+struct netlbl_af6list {
+	struct in6_addr addr;
+	struct in6_addr mask;
+
+	u32 valid;
+	struct list_head list;
+};
+
+#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list)
+
+static inline struct netlbl_af4list *__af4list_valid(struct list_head *s,
+						     struct list_head *h)
+{
+	struct list_head *i = s;
+	struct netlbl_af4list *n = __af4list_entry(s);
+	while (i != h && !n->valid) {
+		i = i->next;
+		n = __af4list_entry(i);
+	}
+	return n;
+}
+
+static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s,
+							 struct list_head *h)
+{
+	struct list_head *i = s;
+	struct netlbl_af4list *n = __af4list_entry(s);
+	while (i != h && !n->valid) {
+		i = rcu_dereference(i->next);
+		n = __af4list_entry(i);
+	}
+	return n;
+}
+
+#define netlbl_af4list_foreach(iter, head)				\
+	for (iter = __af4list_valid((head)->next, head);		\
+	     prefetch(iter->list.next), &iter->list != (head);		\
+	     iter = __af4list_valid(iter->list.next, head))
+
+#define netlbl_af4list_foreach_rcu(iter, head)				\
+	for (iter = __af4list_valid_rcu((head)->next, head);		\
+	     prefetch(iter->list.next),	&iter->list != (head);		\
+	     iter = __af4list_valid_rcu(iter->list.next, head))
+
+#define netlbl_af4list_foreach_safe(iter, tmp, head)			\
+	for (iter = __af4list_valid((head)->next, head),		\
+		     tmp = __af4list_valid(iter->list.next, head);	\
+	     &iter->list != (head);					\
+	     iter = tmp, tmp = __af4list_valid(iter->list.next, head))
+
+int netlbl_af4list_add(struct netlbl_af4list *entry,
+		       struct list_head *head);
+struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
+					     struct list_head *head);
+void netlbl_af4list_remove_entry(struct netlbl_af4list *entry);
+struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
+					     struct list_head *head);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+#define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list)
+
+static inline struct netlbl_af6list *__af6list_valid(struct list_head *s,
+						     struct list_head *h)
+{
+	struct list_head *i = s;
+	struct netlbl_af6list *n = __af6list_entry(s);
+	while (i != h && !n->valid) {
+		i = i->next;
+		n = __af6list_entry(i);
+	}
+	return n;
+}
+
+static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s,
+							 struct list_head *h)
+{
+	struct list_head *i = s;
+	struct netlbl_af6list *n = __af6list_entry(s);
+	while (i != h && !n->valid) {
+		i = rcu_dereference(i->next);
+		n = __af6list_entry(i);
+	}
+	return n;
+}
+
+#define netlbl_af6list_foreach(iter, head)				\
+	for (iter = __af6list_valid((head)->next, head);		\
+	     prefetch(iter->list.next),	&iter->list != (head);		\
+	     iter = __af6list_valid(iter->list.next, head))
+
+#define netlbl_af6list_foreach_rcu(iter, head)				\
+	for (iter = __af6list_valid_rcu((head)->next, head);		\
+	     prefetch(iter->list.next),	&iter->list != (head);		\
+	     iter = __af6list_valid_rcu(iter->list.next, head))
+
+#define netlbl_af6list_foreach_safe(iter, tmp, head)			\
+	for (iter = __af6list_valid((head)->next, head),		\
+		     tmp = __af6list_valid(iter->list.next, head);	\
+	     &iter->list != (head);					\
+	     iter = tmp, tmp = __af6list_valid(iter->list.next, head))
+
+int netlbl_af6list_add(struct netlbl_af6list *entry,
+		       struct list_head *head);
+struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
+					     const struct in6_addr *mask,
+					     struct list_head *head);
+void netlbl_af6list_remove_entry(struct netlbl_af6list *entry);
+struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
+					     struct list_head *head);
+#endif /* IPV6 */
+
+#endif
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index cc105a10e3f..ab8131a8e48 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -10,7 +10,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -54,6 +54,7 @@
 #include <asm/atomic.h>
 
 #include "netlabel_user.h"
+#include "netlabel_addrlist.h"
 #include "netlabel_domainhash.h"
 #include "netlabel_unlabeled.h"
 #include "netlabel_mgmt.h"
@@ -76,22 +77,20 @@ struct netlbl_unlhsh_tbl {
 	struct list_head *tbl;
 	u32 size;
 };
+#define netlbl_unlhsh_addr4_entry(iter) \
+	container_of(iter, struct netlbl_unlhsh_addr4, list)
 struct netlbl_unlhsh_addr4 {
-	__be32 addr;
-	__be32 mask;
 	u32 secid;
 
-	u32 valid;
-	struct list_head list;
+	struct netlbl_af4list list;
 	struct rcu_head rcu;
 };
+#define netlbl_unlhsh_addr6_entry(iter) \
+	container_of(iter, struct netlbl_unlhsh_addr6, list)
 struct netlbl_unlhsh_addr6 {
-	struct in6_addr addr;
-	struct in6_addr mask;
 	u32 secid;
 
-	u32 valid;
-	struct list_head list;
+	struct netlbl_af6list list;
 	struct rcu_head rcu;
 };
 struct netlbl_unlhsh_iface {
@@ -274,26 +273,28 @@ static void netlbl_unlhsh_free_addr6(struct rcu_head *entry)
 static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
 {
 	struct netlbl_unlhsh_iface *iface;
-	struct netlbl_unlhsh_addr4 *iter4;
-	struct netlbl_unlhsh_addr4 *tmp4;
-	struct netlbl_unlhsh_addr6 *iter6;
-	struct netlbl_unlhsh_addr6 *tmp6;
+	struct netlbl_af4list *iter4;
+	struct netlbl_af4list *tmp4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+	struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
 
 	iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
 
 	/* no need for locks here since we are the only one with access to this
 	 * structure */
 
-	list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list)
-		if (iter4->valid) {
-			list_del_rcu(&iter4->list);
-			kfree(iter4);
-		}
-	list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list)
-		if (iter6->valid) {
-			list_del_rcu(&iter6->list);
-			kfree(iter6);
-		}
+	netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) {
+		netlbl_af4list_remove_entry(iter4);
+		kfree(netlbl_unlhsh_addr4_entry(iter4));
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) {
+		netlbl_af6list_remove_entry(iter6);
+		kfree(netlbl_unlhsh_addr6_entry(iter6));
+	}
+#endif /* IPv6 */
 	kfree(iface);
 }
 
@@ -315,59 +316,6 @@ static u32 netlbl_unlhsh_hash(int ifindex)
 	return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1);
 }
 
-/**
- * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry
- * @addr: IPv4 address
- * @iface: the network interface entry
- *
- * Description:
- * Searches the IPv4 address list of the network interface specified by @iface.
- * If a matching address entry is found it is returned, otherwise NULL is
- * returned.  The caller is responsible for calling the rcu_read_[un]lock()
- * functions.
- *
- */
-static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4(
-	                               __be32 addr,
-	                               const struct netlbl_unlhsh_iface *iface)
-{
-	struct netlbl_unlhsh_addr4 *iter;
-
-	list_for_each_entry_rcu(iter, &iface->addr4_list, list)
-		if (iter->valid && (addr & iter->mask) == iter->addr)
-			return iter;
-
-	return NULL;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/**
- * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry
- * @addr: IPv6 address
- * @iface: the network interface entry
- *
- * Description:
- * Searches the IPv6 address list of the network interface specified by @iface.
- * If a matching address entry is found it is returned, otherwise NULL is
- * returned.  The caller is responsible for calling the rcu_read_[un]lock()
- * functions.
- *
- */
-static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
-	                               const struct in6_addr *addr,
-	                               const struct netlbl_unlhsh_iface *iface)
-{
-	struct netlbl_unlhsh_addr6 *iter;
-
-	list_for_each_entry_rcu(iter, &iface->addr6_list, list)
-		if (iter->valid &&
-		    ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
-		return iter;
-
-	return NULL;
-}
-#endif /* IPv6 */
-
 /**
  * netlbl_unlhsh_search_iface - Search for a matching interface entry
  * @ifindex: the network interface
@@ -439,43 +387,26 @@ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
 				   const struct in_addr *mask,
 				   u32 secid)
 {
+	int ret_val;
 	struct netlbl_unlhsh_addr4 *entry;
-	struct netlbl_unlhsh_addr4 *iter;
 
 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL)
 		return -ENOMEM;
 
-	entry->addr = addr->s_addr & mask->s_addr;
-	entry->mask = mask->s_addr;
-	entry->secid = secid;
-	entry->valid = 1;
+	entry->list.addr = addr->s_addr & mask->s_addr;
+	entry->list.mask = mask->s_addr;
+	entry->list.valid = 1;
 	INIT_RCU_HEAD(&entry->rcu);
+	entry->secid = secid;
 
 	spin_lock(&netlbl_unlhsh_lock);
-	iter = netlbl_unlhsh_search_addr4(entry->addr, iface);
-	if (iter != NULL &&
-	    iter->addr == addr->s_addr && iter->mask == mask->s_addr) {
-		spin_unlock(&netlbl_unlhsh_lock);
-		kfree(entry);
-		return -EEXIST;
-	}
-	/* in order to speed up address searches through the list (the common
-	 * case) we need to keep the list in order based on the size of the
-	 * address mask such that the entry with the widest mask (smallest
-	 * numerical value) appears first in the list */
-	list_for_each_entry_rcu(iter, &iface->addr4_list, list)
-		if (iter->valid &&
-		    ntohl(entry->mask) > ntohl(iter->mask)) {
-			__list_add_rcu(&entry->list,
-				       iter->list.prev,
-				       &iter->list);
-			spin_unlock(&netlbl_unlhsh_lock);
-			return 0;
-		}
-	list_add_tail_rcu(&entry->list, &iface->addr4_list);
+	ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list);
 	spin_unlock(&netlbl_unlhsh_lock);
-	return 0;
+
+	if (ret_val != 0)
+		kfree(entry);
+	return ret_val;
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -498,47 +429,29 @@ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
 				   const struct in6_addr *mask,
 				   u32 secid)
 {
+	int ret_val;
 	struct netlbl_unlhsh_addr6 *entry;
-	struct netlbl_unlhsh_addr6 *iter;
 
 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL)
 		return -ENOMEM;
 
-	ipv6_addr_copy(&entry->addr, addr);
-	entry->addr.s6_addr32[0] &= mask->s6_addr32[0];
-	entry->addr.s6_addr32[1] &= mask->s6_addr32[1];
-	entry->addr.s6_addr32[2] &= mask->s6_addr32[2];
-	entry->addr.s6_addr32[3] &= mask->s6_addr32[3];
-	ipv6_addr_copy(&entry->mask, mask);
-	entry->secid = secid;
-	entry->valid = 1;
+	ipv6_addr_copy(&entry->list.addr, addr);
+	entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+	entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+	entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+	entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+	ipv6_addr_copy(&entry->list.mask, mask);
+	entry->list.valid = 1;
 	INIT_RCU_HEAD(&entry->rcu);
+	entry->secid = secid;
 
 	spin_lock(&netlbl_unlhsh_lock);
-	iter = netlbl_unlhsh_search_addr6(&entry->addr, iface);
-	if (iter != NULL &&
-	    (ipv6_addr_equal(&iter->addr, addr) &&
-	     ipv6_addr_equal(&iter->mask, mask))) {
-		spin_unlock(&netlbl_unlhsh_lock);
-		kfree(entry);
-		return -EEXIST;
-	}
-	/* in order to speed up address searches through the list (the common
-	 * case) we need to keep the list in order based on the size of the
-	 * address mask such that the entry with the widest mask (smallest
-	 * numerical value) appears first in the list */
-	list_for_each_entry_rcu(iter, &iface->addr6_list, list)
-		if (iter->valid &&
-		    ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
-			__list_add_rcu(&entry->list,
-				       iter->list.prev,
-				       &iter->list);
-			spin_unlock(&netlbl_unlhsh_lock);
-			return 0;
-		}
-	list_add_tail_rcu(&entry->list, &iface->addr6_list);
+	ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list);
 	spin_unlock(&netlbl_unlhsh_lock);
+
+	if (ret_val != 0)
+		kfree(entry);
 	return 0;
 }
 #endif /* IPv6 */
@@ -719,22 +632,21 @@ static int netlbl_unlhsh_remove_addr4(struct net *net,
 				      const struct in_addr *mask,
 				      struct netlbl_audit *audit_info)
 {
-	int ret_val = -ENOENT;
+	int ret_val = 0;
+	struct netlbl_af4list *list_entry;
 	struct netlbl_unlhsh_addr4 *entry;
-	struct audit_buffer *audit_buf = NULL;
+	struct audit_buffer *audit_buf;
 	struct net_device *dev;
-	char *secctx = NULL;
+	char *secctx;
 	u32 secctx_len;
 
 	spin_lock(&netlbl_unlhsh_lock);
-	entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface);
-	if (entry != NULL &&
-	    entry->addr == addr->s_addr && entry->mask == mask->s_addr) {
-		entry->valid = 0;
-		list_del_rcu(&entry->list);
-		ret_val = 0;
-	}
+	list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
+					   &iface->addr4_list);
 	spin_unlock(&netlbl_unlhsh_lock);
+	if (list_entry == NULL)
+		ret_val = -ENOENT;
+	entry = netlbl_unlhsh_addr4_entry(list_entry);
 
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
 					      audit_info);
@@ -742,12 +654,12 @@ static int netlbl_unlhsh_remove_addr4(struct net *net,
 		dev = dev_get_by_index(net, iface->ifindex);
 		netlbl_unlabel_audit_addr4(audit_buf,
 					   (dev != NULL ? dev->name : NULL),
-					   entry->addr, entry->mask);
+					   addr->s_addr, mask->s_addr);
 		if (dev != NULL)
 			dev_put(dev);
-		if (security_secid_to_secctx(entry->secid,
-					     &secctx,
-					     &secctx_len) == 0) {
+		if (entry && security_secid_to_secctx(entry->secid,
+						      &secctx,
+						      &secctx_len) == 0) {
 			audit_log_format(audit_buf, " sec_obj=%s", secctx);
 			security_release_secctx(secctx, secctx_len);
 		}
@@ -781,23 +693,20 @@ static int netlbl_unlhsh_remove_addr6(struct net *net,
 				      const struct in6_addr *mask,
 				      struct netlbl_audit *audit_info)
 {
-	int ret_val = -ENOENT;
+	int ret_val = 0;
+	struct netlbl_af6list *list_entry;
 	struct netlbl_unlhsh_addr6 *entry;
-	struct audit_buffer *audit_buf = NULL;
+	struct audit_buffer *audit_buf;
 	struct net_device *dev;
-	char *secctx = NULL;
+	char *secctx;
 	u32 secctx_len;
 
 	spin_lock(&netlbl_unlhsh_lock);
-	entry = netlbl_unlhsh_search_addr6(addr, iface);
-	if (entry != NULL &&
-	    (ipv6_addr_equal(&entry->addr, addr) &&
-	     ipv6_addr_equal(&entry->mask, mask))) {
-		entry->valid = 0;
-		list_del_rcu(&entry->list);
-		ret_val = 0;
-	}
+	list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
 	spin_unlock(&netlbl_unlhsh_lock);
+	if (list_entry == NULL)
+		ret_val = -ENOENT;
+	entry = netlbl_unlhsh_addr6_entry(list_entry);
 
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
 					      audit_info);
@@ -808,9 +717,9 @@ static int netlbl_unlhsh_remove_addr6(struct net *net,
 					   addr, mask);
 		if (dev != NULL)
 			dev_put(dev);
-		if (security_secid_to_secctx(entry->secid,
-					     &secctx,
-					     &secctx_len) == 0) {
+		if (entry && security_secid_to_secctx(entry->secid,
+						      &secctx,
+						      &secctx_len) == 0) {
 			audit_log_format(audit_buf, " sec_obj=%s", secctx);
 			security_release_secctx(secctx, secctx_len);
 		}
@@ -836,16 +745,18 @@ static int netlbl_unlhsh_remove_addr6(struct net *net,
  */
 static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
 {
-	struct netlbl_unlhsh_addr4 *iter4;
-	struct netlbl_unlhsh_addr6 *iter6;
+	struct netlbl_af4list *iter4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+#endif /* IPv6 */
 
 	spin_lock(&netlbl_unlhsh_lock);
-	list_for_each_entry_rcu(iter4, &iface->addr4_list, list)
-		if (iter4->valid)
-			goto unlhsh_condremove_failure;
-	list_for_each_entry_rcu(iter6, &iface->addr6_list, list)
-		if (iter6->valid)
-			goto unlhsh_condremove_failure;
+	netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list)
+		goto unlhsh_condremove_failure;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list)
+		goto unlhsh_condremove_failure;
+#endif /* IPv6 */
 	iface->valid = 0;
 	if (iface->ifindex > 0)
 		list_del_rcu(&iface->list);
@@ -1349,7 +1260,7 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd,
 	if (addr4) {
 		struct in_addr addr_struct;
 
-		addr_struct.s_addr = addr4->addr;
+		addr_struct.s_addr = addr4->list.addr;
 		ret_val = nla_put(cb_arg->skb,
 				  NLBL_UNLABEL_A_IPV4ADDR,
 				  sizeof(struct in_addr),
@@ -1357,7 +1268,7 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd,
 		if (ret_val != 0)
 			goto list_cb_failure;
 
-		addr_struct.s_addr = addr4->mask;
+		addr_struct.s_addr = addr4->list.mask;
 		ret_val = nla_put(cb_arg->skb,
 				  NLBL_UNLABEL_A_IPV4MASK,
 				  sizeof(struct in_addr),
@@ -1370,14 +1281,14 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd,
 		ret_val = nla_put(cb_arg->skb,
 				  NLBL_UNLABEL_A_IPV6ADDR,
 				  sizeof(struct in6_addr),
-				  &addr6->addr);
+				  &addr6->list.addr);
 		if (ret_val != 0)
 			goto list_cb_failure;
 
 		ret_val = nla_put(cb_arg->skb,
 				  NLBL_UNLABEL_A_IPV6MASK,
 				  sizeof(struct in6_addr),
-				  &addr6->mask);
+				  &addr6->list.mask);
 		if (ret_val != 0)
 			goto list_cb_failure;
 
@@ -1425,9 +1336,11 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
 	u32 iter_bkt;
 	u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
 	struct netlbl_unlhsh_iface *iface;
-	struct netlbl_unlhsh_addr4 *addr4;
-	struct netlbl_unlhsh_addr6 *addr6;
 	struct list_head *iter_list;
+	struct netlbl_af4list *addr4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *addr6;
+#endif
 
 	cb_arg.nl_cb = cb;
 	cb_arg.skb = skb;
@@ -1442,38 +1355,38 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
 			if (!iface->valid ||
 			    iter_chain++ < skip_chain)
 				continue;
-			list_for_each_entry_rcu(addr4,
-						&iface->addr4_list,
-						list) {
-				if (!addr4->valid || iter_addr4++ < skip_addr4)
+			netlbl_af4list_foreach_rcu(addr4,
+						   &iface->addr4_list) {
+				if (iter_addr4++ < skip_addr4)
 					continue;
 				if (netlbl_unlabel_staticlist_gen(
-					             NLBL_UNLABEL_C_STATICLIST,
-						     iface,
-						     addr4,
-						     NULL,
-						     &cb_arg) < 0) {
+					      NLBL_UNLABEL_C_STATICLIST,
+					      iface,
+					      netlbl_unlhsh_addr4_entry(addr4),
+					      NULL,
+					      &cb_arg) < 0) {
 					iter_addr4--;
 					iter_chain--;
 					goto unlabel_staticlist_return;
 				}
 			}
-			list_for_each_entry_rcu(addr6,
-						&iface->addr6_list,
-						list) {
-				if (!addr6->valid || iter_addr6++ < skip_addr6)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			netlbl_af6list_foreach_rcu(addr6,
+						   &iface->addr6_list) {
+				if (iter_addr6++ < skip_addr6)
 					continue;
 				if (netlbl_unlabel_staticlist_gen(
-						     NLBL_UNLABEL_C_STATICLIST,
-						     iface,
-						     NULL,
-						     addr6,
-						     &cb_arg) < 0) {
+					      NLBL_UNLABEL_C_STATICLIST,
+					      iface,
+					      NULL,
+					      netlbl_unlhsh_addr6_entry(addr6),
+					      &cb_arg) < 0) {
 					iter_addr6--;
 					iter_chain--;
 					goto unlabel_staticlist_return;
 				}
 			}
+#endif /* IPv6 */
 		}
 	}
 
@@ -1504,9 +1417,12 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
 	struct netlbl_unlhsh_iface *iface;
 	u32 skip_addr4 = cb->args[0];
 	u32 skip_addr6 = cb->args[1];
-	u32 iter_addr4 = 0, iter_addr6 = 0;
-	struct netlbl_unlhsh_addr4 *addr4;
-	struct netlbl_unlhsh_addr6 *addr6;
+	u32 iter_addr4 = 0;
+	struct netlbl_af4list *addr4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u32 iter_addr6 = 0;
+	struct netlbl_af6list *addr6;
+#endif
 
 	cb_arg.nl_cb = cb;
 	cb_arg.skb = skb;
@@ -1517,30 +1433,32 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
 	if (iface == NULL || !iface->valid)
 		goto unlabel_staticlistdef_return;
 
-	list_for_each_entry_rcu(addr4, &iface->addr4_list, list) {
-		if (!addr4->valid || iter_addr4++ < skip_addr4)
+	netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
+		if (iter_addr4++ < skip_addr4)
 			continue;
 		if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
-					   iface,
-					   addr4,
-					   NULL,
-					   &cb_arg) < 0) {
+					      iface,
+					      netlbl_unlhsh_addr4_entry(addr4),
+					      NULL,
+					      &cb_arg) < 0) {
 			iter_addr4--;
 			goto unlabel_staticlistdef_return;
 		}
 	}
-	list_for_each_entry_rcu(addr6, &iface->addr6_list, list) {
-		if (!addr6->valid || iter_addr6++ < skip_addr6)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
+		if (iter_addr6++ < skip_addr6)
 			continue;
 		if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
-					   iface,
-					   NULL,
-					   addr6,
-					   &cb_arg) < 0) {
+					      iface,
+					      NULL,
+					      netlbl_unlhsh_addr6_entry(addr6),
+					      &cb_arg) < 0) {
 			iter_addr6--;
 			goto unlabel_staticlistdef_return;
 		}
 	}
+#endif /* IPv6 */
 
 unlabel_staticlistdef_return:
 	rcu_read_unlock();
@@ -1718,25 +1636,27 @@ int netlbl_unlabel_getattr(const struct sk_buff *skb,
 	switch (family) {
 	case PF_INET: {
 		struct iphdr *hdr4;
-		struct netlbl_unlhsh_addr4 *addr4;
+		struct netlbl_af4list *addr4;
 
 		hdr4 = ip_hdr(skb);
-		addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface);
+		addr4 = netlbl_af4list_search(hdr4->saddr,
+					      &iface->addr4_list);
 		if (addr4 == NULL)
 			goto unlabel_getattr_nolabel;
-		secattr->attr.secid = addr4->secid;
+		secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid;
 		break;
 	}
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	case PF_INET6: {
 		struct ipv6hdr *hdr6;
-		struct netlbl_unlhsh_addr6 *addr6;
+		struct netlbl_af6list *addr6;
 
 		hdr6 = ipv6_hdr(skb);
-		addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface);
+		addr6 = netlbl_af6list_search(&hdr6->saddr,
+					      &iface->addr6_list);
 		if (addr6 == NULL)
 			goto unlabel_getattr_nolabel;
-		secattr->attr.secid = addr6->secid;
+		secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid;
 		break;
 	}
 #endif /* IPv6 */
-- 
cgit v1.2.3


From 63c41688743760631188cf0f4ae986a6793ccb0a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:32 -0400
Subject: netlabel: Add network address selectors to the NetLabel/LSM domain
 mapping

This patch extends the NetLabel traffic labeling capabilities to individual
packets based not only on the LSM domain but the by the destination address
as well.  The changes here only affect the core NetLabel infrastructre,
changes to the NetLabel KAPI and individial protocol engines are also
required but are split out into a different patch to ease review.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/netlabel/netlabel_addrlist.c   | 130 ++++++++++++
 net/netlabel/netlabel_addrlist.h   |  15 ++
 net/netlabel/netlabel_domainhash.c | 290 +++++++++++++++++++++++----
 net/netlabel/netlabel_domainhash.h |  38 +++-
 net/netlabel/netlabel_kapi.c       |   7 +-
 net/netlabel/netlabel_mgmt.c       | 398 ++++++++++++++++++++++++++++---------
 net/netlabel/netlabel_mgmt.h       |  59 +++++-
 net/netlabel/netlabel_unlabeled.c  |  96 ++-------
 8 files changed, 811 insertions(+), 222 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c
index dd928aa52db..b0925a30335 100644
--- a/net/netlabel/netlabel_addrlist.c
+++ b/net/netlabel/netlabel_addrlist.c
@@ -39,6 +39,7 @@
 #include <linux/ipv6.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <linux/audit.h>
 
 #include "netlabel_addrlist.h"
 
@@ -69,6 +70,32 @@ struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
 	return NULL;
 }
 
+/**
+ * netlbl_af4list_search_exact - Search for an exact IPv4 address entry
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv4 address list given by @head.  If an exact match if found
+ * it is returned, otherwise NULL is returned.  The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
+						   __be32 mask,
+						   struct list_head *head)
+{
+	struct netlbl_af4list *iter;
+
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid && iter->addr == addr && iter->mask == mask)
+			return iter;
+
+	return NULL;
+}
+
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /**
  * netlbl_af6list_search - Search for a matching IPv6 address entry
@@ -93,6 +120,33 @@ struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
 
 	return NULL;
 }
+
+/**
+ * netlbl_af6list_search_exact - Search for an exact IPv6 address entry
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv6 address list given by @head.  If an exact match if found
+ * it is returned, otherwise NULL is returned.  The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
+						   const struct in6_addr *mask,
+						   struct list_head *head)
+{
+	struct netlbl_af6list *iter;
+
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid &&
+		    ipv6_addr_equal(&iter->addr, addr) &&
+		    ipv6_addr_equal(&iter->mask, mask))
+			return iter;
+
+	return NULL;
+}
 #endif /* IPv6 */
 
 /**
@@ -256,3 +310,79 @@ struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
 	return NULL;
 }
 #endif /* IPv6 */
+
+/*
+ * Audit Helper Functions
+ */
+
+/**
+ * netlbl_af4list_audit_addr - Audit an IPv4 address
+ * @audit_buf: audit buffer
+ * @src: true if source address, false if destination
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv4 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+					int src, const char *dev,
+					__be32 addr, __be32 mask)
+{
+	u32 mask_val = ntohl(mask);
+	char *dir = (src ? "src" : "dst");
+
+	if (dev != NULL)
+		audit_log_format(audit_buf, " netif=%s", dev);
+	audit_log_format(audit_buf, " %s=" NIPQUAD_FMT, dir, NIPQUAD(addr));
+	if (mask_val != 0xffffffff) {
+		u32 mask_len = 0;
+		while (mask_val > 0) {
+			mask_val <<= 1;
+			mask_len++;
+		}
+		audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len);
+	}
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_audit_addr - Audit an IPv6 address
+ * @audit_buf: audit buffer
+ * @src: true if source address, false if destination
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv6 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+				 int src,
+				 const char *dev,
+				 const struct in6_addr *addr,
+				 const struct in6_addr *mask)
+{
+	char *dir = (src ? "src" : "dst");
+
+	if (dev != NULL)
+		audit_log_format(audit_buf, " netif=%s", dev);
+	audit_log_format(audit_buf, " %s=" NIP6_FMT, dir, NIP6(*addr));
+	if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
+		u32 mask_len = 0;
+		u32 mask_val;
+		int iter = -1;
+		while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
+			mask_len += 32;
+		mask_val = ntohl(mask->s6_addr32[iter]);
+		while (mask_val > 0) {
+			mask_val <<= 1;
+			mask_len++;
+		}
+		audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len);
+	}
+}
+#endif /* IPv6 */
diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h
index 0c41df057fa..0242bead405 100644
--- a/net/netlabel/netlabel_addrlist.h
+++ b/net/netlabel/netlabel_addrlist.h
@@ -36,6 +36,7 @@
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 #include <linux/in6.h>
+#include <linux/audit.h>
 
 /**
  * struct netlbl_af4list - NetLabel IPv4 address list
@@ -116,6 +117,12 @@ struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
 void netlbl_af4list_remove_entry(struct netlbl_af4list *entry);
 struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
 					     struct list_head *head);
+struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
+						   __be32 mask,
+						   struct list_head *head);
+void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+			       int src, const char *dev,
+			       __be32 addr, __be32 mask);
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 
@@ -169,6 +176,14 @@ struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
 void netlbl_af6list_remove_entry(struct netlbl_af6list *entry);
 struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
 					     struct list_head *head);
+struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
+						   const struct in6_addr *mask,
+						   struct list_head *head);
+void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+			       int src,
+			       const char *dev,
+			       const struct in6_addr *addr,
+			       const struct in6_addr *mask);
 #endif /* IPV6 */
 
 #endif
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index 0243f0c57b4..5fadf10e5dd 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -11,7 +11,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,6 +40,7 @@
 #include <asm/bug.h>
 
 #include "netlabel_mgmt.h"
+#include "netlabel_addrlist.h"
 #include "netlabel_domainhash.h"
 #include "netlabel_user.h"
 
@@ -72,8 +73,28 @@ static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
 static void netlbl_domhsh_free_entry(struct rcu_head *entry)
 {
 	struct netlbl_dom_map *ptr;
+	struct netlbl_af4list *iter4;
+	struct netlbl_af4list *tmp4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+	struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
 
 	ptr = container_of(entry, struct netlbl_dom_map, rcu);
+	if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) {
+		netlbl_af4list_foreach_safe(iter4, tmp4,
+					    &ptr->type_def.addrsel->list4) {
+			netlbl_af4list_remove_entry(iter4);
+			kfree(netlbl_domhsh_addr4_entry(iter4));
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_safe(iter6, tmp6,
+					    &ptr->type_def.addrsel->list6) {
+			netlbl_af6list_remove_entry(iter6);
+			kfree(netlbl_domhsh_addr6_entry(iter6));
+		}
+#endif /* IPv6 */
+	}
 	kfree(ptr->domain);
 	kfree(ptr);
 }
@@ -156,6 +177,69 @@ static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
 	return entry;
 }
 
+/**
+ * netlbl_domhsh_audit_add - Generate an audit entry for an add event
+ * @entry: the entry being added
+ * @addr4: the IPv4 address information
+ * @addr6: the IPv6 address information
+ * @result: the result code
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Generate an audit record for adding a new NetLabel/LSM mapping entry with
+ * the given information.  Caller is responsibile for holding the necessary
+ * locks.
+ *
+ */
+static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
+				    struct netlbl_af4list *addr4,
+				    struct netlbl_af6list *addr6,
+				    int result,
+				    struct netlbl_audit *audit_info)
+{
+	struct audit_buffer *audit_buf;
+	struct cipso_v4_doi *cipsov4 = NULL;
+	u32 type;
+
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
+	if (audit_buf != NULL) {
+		audit_log_format(audit_buf, " nlbl_domain=%s",
+				 entry->domain ? entry->domain : "(default)");
+		if (addr4 != NULL) {
+			struct netlbl_domaddr4_map *map4;
+			map4 = netlbl_domhsh_addr4_entry(addr4);
+			type = map4->type;
+			cipsov4 = map4->type_def.cipsov4;
+			netlbl_af4list_audit_addr(audit_buf, 0, NULL,
+						  addr4->addr, addr4->mask);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		} else if (addr6 != NULL) {
+			struct netlbl_domaddr6_map *map6;
+			map6 = netlbl_domhsh_addr6_entry(addr6);
+			type = map6->type;
+			netlbl_af6list_audit_addr(audit_buf, 0, NULL,
+						  &addr6->addr, &addr6->mask);
+#endif /* IPv6 */
+		} else {
+			type = entry->type;
+			cipsov4 = entry->type_def.cipsov4;
+		}
+		switch (type) {
+		case NETLBL_NLTYPE_UNLABELED:
+			audit_log_format(audit_buf, " nlbl_protocol=unlbl");
+			break;
+		case NETLBL_NLTYPE_CIPSOV4:
+			BUG_ON(cipsov4 == NULL);
+			audit_log_format(audit_buf,
+					 " nlbl_protocol=cipsov4 cipso_doi=%u",
+					 cipsov4->doi);
+			break;
+		}
+		audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+}
+
 /*
  * Domain Hash Table Functions
  */
@@ -213,50 +297,106 @@ int __init netlbl_domhsh_init(u32 size)
 int netlbl_domhsh_add(struct netlbl_dom_map *entry,
 		      struct netlbl_audit *audit_info)
 {
-	int ret_val;
-	u32 bkt;
-	struct audit_buffer *audit_buf;
-
-	entry->valid = 1;
-	INIT_RCU_HEAD(&entry->rcu);
+	int ret_val = 0;
+	struct netlbl_dom_map *entry_old;
+	struct netlbl_af4list *iter4;
+	struct netlbl_af4list *tmp4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+	struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
 
 	rcu_read_lock();
+
 	spin_lock(&netlbl_domhsh_lock);
-	if (entry->domain != NULL) {
-		bkt = netlbl_domhsh_hash(entry->domain);
-		if (netlbl_domhsh_search(entry->domain) == NULL)
+	if (entry->domain != NULL)
+		entry_old = netlbl_domhsh_search(entry->domain);
+	else
+		entry_old = netlbl_domhsh_search_def(entry->domain);
+	if (entry_old == NULL) {
+		entry->valid = 1;
+		INIT_RCU_HEAD(&entry->rcu);
+
+		if (entry->domain != NULL) {
+			u32 bkt = netlbl_domhsh_hash(entry->domain);
 			list_add_tail_rcu(&entry->list,
 				    &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
-		else
-			ret_val = -EEXIST;
-	} else {
-		INIT_LIST_HEAD(&entry->list);
-		if (rcu_dereference(netlbl_domhsh_def) == NULL)
+		} else {
+			INIT_LIST_HEAD(&entry->list);
 			rcu_assign_pointer(netlbl_domhsh_def, entry);
-		else
-			ret_val = -EEXIST;
-	}
-	spin_unlock(&netlbl_domhsh_lock);
-	audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
-	if (audit_buf != NULL) {
-		audit_log_format(audit_buf,
-				 " nlbl_domain=%s",
-				 entry->domain ? entry->domain : "(default)");
-		switch (entry->type) {
-		case NETLBL_NLTYPE_UNLABELED:
-			audit_log_format(audit_buf, " nlbl_protocol=unlbl");
-			break;
-		case NETLBL_NLTYPE_CIPSOV4:
-			audit_log_format(audit_buf,
-					 " nlbl_protocol=cipsov4 cipso_doi=%u",
-					 entry->type_def.cipsov4->doi);
-			break;
 		}
-		audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
-		audit_log_end(audit_buf);
-	}
-	rcu_read_unlock();
 
+		if (entry->type == NETLBL_NLTYPE_ADDRSELECT) {
+			netlbl_af4list_foreach_rcu(iter4,
+					       &entry->type_def.addrsel->list4)
+				netlbl_domhsh_audit_add(entry, iter4, NULL,
+							ret_val, audit_info);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			netlbl_af6list_foreach_rcu(iter6,
+					       &entry->type_def.addrsel->list6)
+				netlbl_domhsh_audit_add(entry, NULL, iter6,
+							ret_val, audit_info);
+#endif /* IPv6 */
+		} else
+			netlbl_domhsh_audit_add(entry, NULL, NULL,
+						ret_val, audit_info);
+	} else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT &&
+		   entry->type == NETLBL_NLTYPE_ADDRSELECT) {
+		struct list_head *old_list4;
+		struct list_head *old_list6;
+
+		old_list4 = &entry_old->type_def.addrsel->list4;
+		old_list6 = &entry_old->type_def.addrsel->list6;
+
+		/* we only allow the addition of address selectors if all of
+		 * the selectors do not exist in the existing domain map */
+		netlbl_af4list_foreach_rcu(iter4,
+					   &entry->type_def.addrsel->list4)
+			if (netlbl_af4list_search_exact(iter4->addr,
+							iter4->mask,
+							old_list4)) {
+				ret_val = -EEXIST;
+				goto add_return;
+			}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_rcu(iter6,
+					   &entry->type_def.addrsel->list6)
+			if (netlbl_af6list_search_exact(&iter6->addr,
+							&iter6->mask,
+							old_list6)) {
+				ret_val = -EEXIST;
+				goto add_return;
+			}
+#endif /* IPv6 */
+
+		netlbl_af4list_foreach_safe(iter4, tmp4,
+					    &entry->type_def.addrsel->list4) {
+			netlbl_af4list_remove_entry(iter4);
+			iter4->valid = 1;
+			ret_val = netlbl_af4list_add(iter4, old_list4);
+			netlbl_domhsh_audit_add(entry_old, iter4, NULL,
+						ret_val, audit_info);
+			if (ret_val != 0)
+				goto add_return;
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_safe(iter6, tmp6,
+					    &entry->type_def.addrsel->list6) {
+			netlbl_af6list_remove_entry(iter6);
+			iter6->valid = 1;
+			ret_val = netlbl_af6list_add(iter6, old_list6);
+			netlbl_domhsh_audit_add(entry_old, NULL, iter6,
+						ret_val, audit_info);
+			if (ret_val != 0)
+				goto add_return;
+		}
+#endif /* IPv6 */
+	} else
+		ret_val = -EINVAL;
+
+add_return:
+	spin_unlock(&netlbl_domhsh_lock);
+	rcu_read_unlock();
 	return ret_val;
 }
 
@@ -319,7 +459,19 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
 	}
 
 	if (ret_val == 0) {
+		struct netlbl_af4list *iter4;
+		struct netlbl_domaddr4_map *map4;
+
 		switch (entry->type) {
+		case NETLBL_NLTYPE_ADDRSELECT:
+			netlbl_af4list_foreach_rcu(iter4,
+					     &entry->type_def.addrsel->list4) {
+				map4 = netlbl_domhsh_addr4_entry(iter4);
+				cipso_v4_doi_putdef(map4->type_def.cipsov4);
+			}
+			/* no need to check the IPv6 list since we currently
+			 * support only unlabeled protocols for IPv6 */
+			break;
 		case NETLBL_NLTYPE_CIPSOV4:
 			cipso_v4_doi_putdef(entry->type_def.cipsov4);
 			break;
@@ -387,6 +539,70 @@ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
 	return netlbl_domhsh_search_def(domain);
 }
 
+/**
+ * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ * @addr: the IP address to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain
+ * and @addr, return a pointer to a copy of the entry or NULL.  The caller is
+ * responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain,
+						       __be32 addr)
+{
+	struct netlbl_dom_map *dom_iter;
+	struct netlbl_af4list *addr_iter;
+
+	dom_iter = netlbl_domhsh_search_def(domain);
+	if (dom_iter == NULL)
+		return NULL;
+	if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT)
+		return NULL;
+
+	addr_iter = netlbl_af4list_search(addr,
+					  &dom_iter->type_def.addrsel->list4);
+	if (addr_iter == NULL)
+		return NULL;
+
+	return netlbl_domhsh_addr4_entry(addr_iter);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ * @addr: the IP address to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain
+ * and @addr, return a pointer to a copy of the entry or NULL.  The caller is
+ * responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
+						   const struct in6_addr *addr)
+{
+	struct netlbl_dom_map *dom_iter;
+	struct netlbl_af6list *addr_iter;
+
+	dom_iter = netlbl_domhsh_search_def(domain);
+	if (dom_iter == NULL)
+		return NULL;
+	if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT)
+		return NULL;
+
+	addr_iter = netlbl_af6list_search(addr,
+					  &dom_iter->type_def.addrsel->list6);
+	if (addr_iter == NULL)
+		return NULL;
+
+	return netlbl_domhsh_addr6_entry(addr_iter);
+}
+#endif /* IPv6 */
+
 /**
  * netlbl_domhsh_walk - Iterate through the domain mapping hash table
  * @skip_bkt: the number of buckets to skip at the start
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index afcc41a7432..bfcb6763a1a 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -11,7 +11,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,16 +36,43 @@
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 
+#include "netlabel_addrlist.h"
+
 /* Domain hash table size */
 /* XXX - currently this number is an uneducated guess */
 #define NETLBL_DOMHSH_BITSIZE       7
 
-/* Domain mapping definition struct */
+/* Domain mapping definition structures */
+#define netlbl_domhsh_addr4_entry(iter) \
+	container_of(iter, struct netlbl_domaddr4_map, list)
+struct netlbl_domaddr4_map {
+	u32 type;
+	union {
+		struct cipso_v4_doi *cipsov4;
+	} type_def;
+
+	struct netlbl_af4list list;
+};
+#define netlbl_domhsh_addr6_entry(iter) \
+	container_of(iter, struct netlbl_domaddr6_map, list)
+struct netlbl_domaddr6_map {
+	u32 type;
+
+	/* NOTE: no 'type_def' union needed at present since we don't currently
+	 *       support any IPv6 labeling protocols */
+
+	struct netlbl_af6list list;
+};
+struct netlbl_domaddr_map {
+	struct list_head list4;
+	struct list_head list6;
+};
 struct netlbl_dom_map {
 	char *domain;
 	u32 type;
 	union {
 		struct cipso_v4_doi *cipsov4;
+		struct netlbl_domaddr_map *addrsel;
 	} type_def;
 
 	u32 valid;
@@ -66,9 +93,16 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
+struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain,
+						       __be32 addr);
 int netlbl_domhsh_walk(u32 *skip_bkt,
 		     u32 *skip_chain,
 		     int (*callback) (struct netlbl_dom_map *entry, void *arg),
 		     void *cb_arg);
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
+						  const struct in6_addr *addr);
+#endif /* IPv6 */
+
 #endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 7d8ecea9391..8b820dc9806 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -419,7 +419,9 @@ int netlbl_enabled(void)
  * Attach the correct label to the given socket using the security attributes
  * specified in @secattr.  This function requires exclusive access to @sk,
  * which means it either needs to be in the process of being created or locked.
- * Returns zero on success, negative values on failure.
+ * Returns zero on success, -EDESTADDRREQ if the domain is configured to use
+ * network address selectors (can't blindly label the socket), and negative
+ * values on all other failures.
  *
  */
 int netlbl_sock_setattr(struct sock *sk,
@@ -433,6 +435,9 @@ int netlbl_sock_setattr(struct sock *sk,
 	if (dom_entry == NULL)
 		goto socket_setattr_return;
 	switch (dom_entry->type) {
+	case NETLBL_NLTYPE_ADDRSELECT:
+		ret_val = -EDESTADDRREQ;
+		break;
 	case NETLBL_NLTYPE_CIPSOV4:
 		ret_val = cipso_v4_sock_setattr(sk,
 						dom_entry->type_def.cipsov4,
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index c4e18c7bc0c..ee769ecaa13 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -10,7 +10,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -32,9 +32,13 @@
 #include <linux/socket.h>
 #include <linux/string.h>
 #include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/in6.h>
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
 #include <asm/atomic.h>
@@ -71,79 +75,336 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
 };
 
 /*
- * NetLabel Command Handlers
+ * Helper Functions
  */
 
 /**
  * netlbl_mgmt_add - Handle an ADD message
- * @skb: the NETLINK buffer
  * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
  *
  * Description:
- * Process a user generated ADD message and add the domains from the message
- * to the hash table.  See netlabel.h for a description of the message format.
- * Returns zero on success, negative values on failure.
+ * Helper function for the ADD and ADDDEF messages to add the domain mappings
+ * from the message to the hash table.  See netlabel.h for a description of the
+ * message format.  Returns zero on success, negative values on failure.
  *
  */
-static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info)
+static int netlbl_mgmt_add_common(struct genl_info *info,
+				  struct netlbl_audit *audit_info)
 {
 	int ret_val = -EINVAL;
 	struct netlbl_dom_map *entry = NULL;
-	size_t tmp_size;
+	struct netlbl_domaddr_map *addrmap = NULL;
+	struct cipso_v4_doi *cipsov4 = NULL;
 	u32 tmp_val;
-	struct netlbl_audit audit_info;
-
-	if (!info->attrs[NLBL_MGMT_A_DOMAIN] ||
-	    !info->attrs[NLBL_MGMT_A_PROTOCOL])
-		goto add_failure;
-
-	netlbl_netlink_auditinfo(skb, &audit_info);
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (entry == NULL) {
 		ret_val = -ENOMEM;
 		goto add_failure;
 	}
-	tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]);
-	entry->domain = kmalloc(tmp_size, GFP_KERNEL);
-	if (entry->domain == NULL) {
-		ret_val = -ENOMEM;
-		goto add_failure;
-	}
 	entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]);
-	nla_strlcpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size);
+	if (info->attrs[NLBL_MGMT_A_DOMAIN]) {
+		size_t tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]);
+		entry->domain = kmalloc(tmp_size, GFP_KERNEL);
+		if (entry->domain == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		nla_strlcpy(entry->domain,
+			    info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size);
+	}
+
+	/* NOTE: internally we allow/use a entry->type value of
+	 *       NETLBL_NLTYPE_ADDRSELECT but we don't currently allow users
+	 *       to pass that as a protocol value because we need to know the
+	 *       "real" protocol */
 
 	switch (entry->type) {
 	case NETLBL_NLTYPE_UNLABELED:
-		ret_val = netlbl_domhsh_add(entry, &audit_info);
 		break;
 	case NETLBL_NLTYPE_CIPSOV4:
 		if (!info->attrs[NLBL_MGMT_A_CV4DOI])
 			goto add_failure;
 
 		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
-		entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
-		if (entry->type_def.cipsov4 == NULL)
+		cipsov4 = cipso_v4_doi_getdef(tmp_val);
+		if (cipsov4 == NULL)
 			goto add_failure;
-		ret_val = netlbl_domhsh_add(entry, &audit_info);
-		if (ret_val != 0)
-			cipso_v4_doi_putdef(entry->type_def.cipsov4);
+		entry->type_def.cipsov4 = cipsov4;
 		break;
 	default:
 		goto add_failure;
 	}
+
+	if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) {
+		struct in_addr *addr;
+		struct in_addr *mask;
+		struct netlbl_domaddr4_map *map;
+
+		addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL);
+		if (addrmap == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV4ADDR]) !=
+		    sizeof(struct in_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV4MASK]) !=
+		    sizeof(struct in_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]);
+		mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]);
+
+		map = kzalloc(sizeof(*map), GFP_KERNEL);
+		if (map == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		map->list.addr = addr->s_addr & mask->s_addr;
+		map->list.mask = mask->s_addr;
+		map->list.valid = 1;
+		map->type = entry->type;
+		if (cipsov4)
+			map->type_def.cipsov4 = cipsov4;
+
+		ret_val = netlbl_af4list_add(&map->list, &addrmap->list4);
+		if (ret_val != 0) {
+			kfree(map);
+			goto add_failure;
+		}
+
+		entry->type = NETLBL_NLTYPE_ADDRSELECT;
+		entry->type_def.addrsel = addrmap;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	} else if (info->attrs[NLBL_MGMT_A_IPV6ADDR]) {
+		struct in6_addr *addr;
+		struct in6_addr *mask;
+		struct netlbl_domaddr6_map *map;
+
+		addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL);
+		if (addrmap == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV6ADDR]) !=
+		    sizeof(struct in6_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV6MASK]) !=
+		    sizeof(struct in6_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]);
+		mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]);
+
+		map = kzalloc(sizeof(*map), GFP_KERNEL);
+		if (map == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		ipv6_addr_copy(&map->list.addr, addr);
+		map->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+		map->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+		map->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+		map->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+		ipv6_addr_copy(&map->list.mask, mask);
+		map->list.valid = 1;
+		map->type = entry->type;
+
+		ret_val = netlbl_af6list_add(&map->list, &addrmap->list6);
+		if (ret_val != 0) {
+			kfree(map);
+			goto add_failure;
+		}
+
+		entry->type = NETLBL_NLTYPE_ADDRSELECT;
+		entry->type_def.addrsel = addrmap;
+#endif /* IPv6 */
+	}
+
+	ret_val = netlbl_domhsh_add(entry, audit_info);
 	if (ret_val != 0)
 		goto add_failure;
 
 	return 0;
 
 add_failure:
+	if (cipsov4)
+		cipso_v4_doi_putdef(cipsov4);
 	if (entry)
 		kfree(entry->domain);
+	kfree(addrmap);
 	kfree(entry);
 	return ret_val;
 }
 
+/**
+ * netlbl_mgmt_listentry - List a NetLabel/LSM domain map entry
+ * @skb: the NETLINK buffer
+ * @entry: the map entry
+ *
+ * Description:
+ * This function is a helper function used by the LISTALL and LISTDEF command
+ * handlers.  The caller is responsibile for ensuring that the RCU read lock
+ * is held.  Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_listentry(struct sk_buff *skb,
+				 struct netlbl_dom_map *entry)
+{
+	int ret_val;
+	struct nlattr *nla_a;
+	struct nlattr *nla_b;
+	struct netlbl_af4list *iter4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+#endif
+
+	if (entry->domain != NULL) {
+		ret_val = nla_put_string(skb,
+					 NLBL_MGMT_A_DOMAIN, entry->domain);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	switch (entry->type) {
+	case NETLBL_NLTYPE_ADDRSELECT:
+		nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST);
+		if (nla_a == NULL)
+			return -ENOMEM;
+
+		netlbl_af4list_foreach_rcu(iter4,
+					   &entry->type_def.addrsel->list4) {
+			struct netlbl_domaddr4_map *map4;
+			struct in_addr addr_struct;
+
+			nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR);
+			if (nla_b == NULL)
+				return -ENOMEM;
+
+			addr_struct.s_addr = iter4->addr;
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV4ADDR,
+					  sizeof(struct in_addr),
+					  &addr_struct);
+			if (ret_val != 0)
+				return ret_val;
+			addr_struct.s_addr = iter4->mask;
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV4MASK,
+					  sizeof(struct in_addr),
+					  &addr_struct);
+			if (ret_val != 0)
+				return ret_val;
+			map4 = netlbl_domhsh_addr4_entry(iter4);
+			ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+					      map4->type);
+			if (ret_val != 0)
+				return ret_val;
+			switch (map4->type) {
+			case NETLBL_NLTYPE_CIPSOV4:
+				ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
+						  map4->type_def.cipsov4->doi);
+				if (ret_val != 0)
+					return ret_val;
+				break;
+			}
+
+			nla_nest_end(skb, nla_b);
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_rcu(iter6,
+					   &entry->type_def.addrsel->list6) {
+			struct netlbl_domaddr6_map *map6;
+
+			nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR);
+			if (nla_b == NULL)
+				return -ENOMEM;
+
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV6ADDR,
+					  sizeof(struct in6_addr),
+					  &iter6->addr);
+			if (ret_val != 0)
+				return ret_val;
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV6MASK,
+					  sizeof(struct in6_addr),
+					  &iter6->mask);
+			if (ret_val != 0)
+				return ret_val;
+			map6 = netlbl_domhsh_addr6_entry(iter6);
+			ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+					      map6->type);
+			if (ret_val != 0)
+				return ret_val;
+
+			nla_nest_end(skb, nla_b);
+		}
+#endif /* IPv6 */
+
+		nla_nest_end(skb, nla_a);
+		break;
+	case NETLBL_NLTYPE_UNLABELED:
+		ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type);
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type);
+		if (ret_val != 0)
+			return ret_val;
+		ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
+				      entry->type_def.cipsov4->doi);
+		break;
+	}
+
+	return ret_val;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_mgmt_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated ADD message and add the domains from the message
+ * to the hash table.  See netlabel.h for a description of the message format.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlbl_audit audit_info;
+
+	if ((!info->attrs[NLBL_MGMT_A_DOMAIN]) ||
+	    (!info->attrs[NLBL_MGMT_A_PROTOCOL]) ||
+	    (info->attrs[NLBL_MGMT_A_IPV4ADDR] &&
+	     info->attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+	    (info->attrs[NLBL_MGMT_A_IPV4MASK] &&
+	     info->attrs[NLBL_MGMT_A_IPV6MASK]) ||
+	    ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^
+	     (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) ||
+	    ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^
+	     (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL)))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	return netlbl_mgmt_add_common(info, &audit_info);
+}
+
 /**
  * netlbl_mgmt_remove - Handle a REMOVE message
  * @skb: the NETLINK buffer
@@ -192,23 +453,9 @@ static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg)
 	if (data == NULL)
 		goto listall_cb_failure;
 
-	ret_val = nla_put_string(cb_arg->skb,
-				 NLBL_MGMT_A_DOMAIN,
-				 entry->domain);
+	ret_val = netlbl_mgmt_listentry(cb_arg->skb, entry);
 	if (ret_val != 0)
 		goto listall_cb_failure;
-	ret_val = nla_put_u32(cb_arg->skb, NLBL_MGMT_A_PROTOCOL, entry->type);
-	if (ret_val != 0)
-		goto listall_cb_failure;
-	switch (entry->type) {
-	case NETLBL_NLTYPE_CIPSOV4:
-		ret_val = nla_put_u32(cb_arg->skb,
-				      NLBL_MGMT_A_CV4DOI,
-				      entry->type_def.cipsov4->doi);
-		if (ret_val != 0)
-			goto listall_cb_failure;
-		break;
-	}
 
 	cb_arg->seq++;
 	return genlmsg_end(cb_arg->skb, data);
@@ -262,50 +509,22 @@ static int netlbl_mgmt_listall(struct sk_buff *skb,
  */
 static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val = -EINVAL;
-	struct netlbl_dom_map *entry = NULL;
-	u32 tmp_val;
 	struct netlbl_audit audit_info;
 
-	if (!info->attrs[NLBL_MGMT_A_PROTOCOL])
-		goto adddef_failure;
+	if ((!info->attrs[NLBL_MGMT_A_PROTOCOL]) ||
+	    (info->attrs[NLBL_MGMT_A_IPV4ADDR] &&
+	     info->attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+	    (info->attrs[NLBL_MGMT_A_IPV4MASK] &&
+	     info->attrs[NLBL_MGMT_A_IPV6MASK]) ||
+	    ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^
+	     (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) ||
+	    ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^
+	     (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL)))
+		return -EINVAL;
 
 	netlbl_netlink_auditinfo(skb, &audit_info);
 
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (entry == NULL) {
-		ret_val = -ENOMEM;
-		goto adddef_failure;
-	}
-	entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]);
-
-	switch (entry->type) {
-	case NETLBL_NLTYPE_UNLABELED:
-		ret_val = netlbl_domhsh_add_default(entry, &audit_info);
-		break;
-	case NETLBL_NLTYPE_CIPSOV4:
-		if (!info->attrs[NLBL_MGMT_A_CV4DOI])
-			goto adddef_failure;
-
-		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
-		entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
-		if (entry->type_def.cipsov4 == NULL)
-			goto adddef_failure;
-		ret_val = netlbl_domhsh_add_default(entry, &audit_info);
-		if (ret_val != 0)
-			cipso_v4_doi_putdef(entry->type_def.cipsov4);
-		break;
-	default:
-		goto adddef_failure;
-	}
-	if (ret_val != 0)
-		goto adddef_failure;
-
-	return 0;
-
-adddef_failure:
-	kfree(entry);
-	return ret_val;
+	return netlbl_mgmt_add_common(info, &audit_info);
 }
 
 /**
@@ -359,19 +578,10 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
 		ret_val = -ENOENT;
 		goto listdef_failure_lock;
 	}
-	ret_val = nla_put_u32(ans_skb, NLBL_MGMT_A_PROTOCOL, entry->type);
-	if (ret_val != 0)
-		goto listdef_failure_lock;
-	switch (entry->type) {
-	case NETLBL_NLTYPE_CIPSOV4:
-		ret_val = nla_put_u32(ans_skb,
-				      NLBL_MGMT_A_CV4DOI,
-				      entry->type_def.cipsov4->doi);
-		if (ret_val != 0)
-			goto listdef_failure_lock;
-		break;
-	}
+	ret_val = netlbl_mgmt_listentry(ans_skb, entry);
 	rcu_read_unlock();
+	if (ret_val != 0)
+		goto listdef_failure;
 
 	genlmsg_end(ans_skb, data);
 	return genlmsg_reply(ans_skb, info);
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index a43bff169d6..05d96431f81 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -45,6 +45,16 @@
  *     NLBL_MGMT_A_DOMAIN
  *     NLBL_MGMT_A_PROTOCOL
  *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_MGMT_A_IPV4ADDR
+ *     NLBL_MGMT_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_MGMT_A_IPV6ADDR
+ *     NLBL_MGMT_A_IPV6MASK
+ *
  *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
  *
  *     NLBL_MGMT_A_CV4DOI
@@ -68,13 +78,24 @@
  *   Required attributes:
  *
  *     NLBL_MGMT_A_DOMAIN
+ *
+ *   If the IP address selectors are not used the following attribute is
+ *   required:
+ *
  *     NLBL_MGMT_A_PROTOCOL
  *
- *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
+ *   If the IP address selectors are used then the following attritbute is
+ *   required:
+ *
+ *     NLBL_MGMT_A_SELECTORLIST
+ *
+ *   If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following
+ *   attributes are required:
  *
  *     NLBL_MGMT_A_CV4DOI
  *
- *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ *   If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other
+ *   attributes are required.
  *
  * o ADDDEF:
  *   Sent by an application to set the default domain mapping for the NetLabel
@@ -100,15 +121,23 @@
  *   application there is no payload.  On success the kernel should send a
  *   response using the following format.
  *
- *   Required attributes:
+ *   If the IP address selectors are not used the following attribute is
+ *   required:
  *
  *     NLBL_MGMT_A_PROTOCOL
  *
- *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
+ *   If the IP address selectors are used then the following attritbute is
+ *   required:
+ *
+ *     NLBL_MGMT_A_SELECTORLIST
+ *
+ *   If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following
+ *   attributes are required:
  *
  *     NLBL_MGMT_A_CV4DOI
  *
- *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ *   If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other
+ *   attributes are required.
  *
  * o PROTOCOLS:
  *   Sent by an application to request a list of configured NetLabel protocols
@@ -162,6 +191,26 @@ enum {
 	NLBL_MGMT_A_CV4DOI,
 	/* (NLA_U32)
 	 * the CIPSOv4 DOI value */
+	NLBL_MGMT_A_IPV6ADDR,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address */
+	NLBL_MGMT_A_IPV6MASK,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address mask */
+	NLBL_MGMT_A_IPV4ADDR,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address */
+	NLBL_MGMT_A_IPV4MASK,
+	/* (NLA_BINARY, struct in_addr)
+	 * and IPv4 address mask */
+	NLBL_MGMT_A_ADDRSELECTOR,
+	/* (NLA_NESTED)
+	 * an IP address selector, must contain an address, mask, and protocol
+	 * attribute plus any protocol specific attributes */
+	NLBL_MGMT_A_SELECTORLIST,
+	/* (NLA_NESTED)
+	 * the selector list, there must be at least one
+	 * NLBL_MGMT_A_ADDRSELECTOR attribute */
 	__NLBL_MGMT_A_MAX,
 };
 #define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index ab8131a8e48..e8a5c32b0f1 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -145,76 +145,6 @@ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1
 	[NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
 };
 
-/*
- * Audit Helper Functions
- */
-
-/**
- * netlbl_unlabel_audit_addr4 - Audit an IPv4 address
- * @audit_buf: audit buffer
- * @dev: network interface
- * @addr: IP address
- * @mask: IP address mask
- *
- * Description:
- * Write the IPv4 address and address mask, if necessary, to @audit_buf.
- *
- */
-static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf,
-				     const char *dev,
-				     __be32 addr, __be32 mask)
-{
-	u32 mask_val = ntohl(mask);
-
-	if (dev != NULL)
-		audit_log_format(audit_buf, " netif=%s", dev);
-	audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr));
-	if (mask_val != 0xffffffff) {
-		u32 mask_len = 0;
-		while (mask_val > 0) {
-			mask_val <<= 1;
-			mask_len++;
-		}
-		audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
-	}
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/**
- * netlbl_unlabel_audit_addr6 - Audit an IPv6 address
- * @audit_buf: audit buffer
- * @dev: network interface
- * @addr: IP address
- * @mask: IP address mask
- *
- * Description:
- * Write the IPv6 address and address mask, if necessary, to @audit_buf.
- *
- */
-static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf,
-				     const char *dev,
-				     const struct in6_addr *addr,
-				     const struct in6_addr *mask)
-{
-	if (dev != NULL)
-		audit_log_format(audit_buf, " netif=%s", dev);
-	audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr));
-	if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
-		u32 mask_len = 0;
-		u32 mask_val;
-		int iter = -1;
-		while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
-			mask_len += 32;
-		mask_val = ntohl(mask->s6_addr32[iter]);
-		while (mask_val > 0) {
-			mask_val <<= 1;
-			mask_len++;
-		}
-		audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
-	}
-}
-#endif /* IPv6 */
-
 /*
  * Unlabeled Connection Hash Table Functions
  */
@@ -571,10 +501,10 @@ static int netlbl_unlhsh_add(struct net *net,
 		mask4 = (struct in_addr *)mask;
 		ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
 		if (audit_buf != NULL)
-			netlbl_unlabel_audit_addr4(audit_buf,
-						   dev_name,
-						   addr4->s_addr,
-						   mask4->s_addr);
+			netlbl_af4list_audit_addr(audit_buf, 1,
+						  dev_name,
+						  addr4->s_addr,
+						  mask4->s_addr);
 		break;
 	}
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -585,9 +515,9 @@ static int netlbl_unlhsh_add(struct net *net,
 		mask6 = (struct in6_addr *)mask;
 		ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
 		if (audit_buf != NULL)
-			netlbl_unlabel_audit_addr6(audit_buf,
-						   dev_name,
-						   addr6, mask6);
+			netlbl_af6list_audit_addr(audit_buf, 1,
+						  dev_name,
+						  addr6, mask6);
 		break;
 	}
 #endif /* IPv6 */
@@ -652,9 +582,9 @@ static int netlbl_unlhsh_remove_addr4(struct net *net,
 					      audit_info);
 	if (audit_buf != NULL) {
 		dev = dev_get_by_index(net, iface->ifindex);
-		netlbl_unlabel_audit_addr4(audit_buf,
-					   (dev != NULL ? dev->name : NULL),
-					   addr->s_addr, mask->s_addr);
+		netlbl_af4list_audit_addr(audit_buf, 1,
+					  (dev != NULL ? dev->name : NULL),
+					  addr->s_addr, mask->s_addr);
 		if (dev != NULL)
 			dev_put(dev);
 		if (entry && security_secid_to_secctx(entry->secid,
@@ -712,9 +642,9 @@ static int netlbl_unlhsh_remove_addr6(struct net *net,
 					      audit_info);
 	if (audit_buf != NULL) {
 		dev = dev_get_by_index(net, iface->ifindex);
-		netlbl_unlabel_audit_addr6(audit_buf,
-					   (dev != NULL ? dev->name : NULL),
-					   addr, mask);
+		netlbl_af6list_audit_addr(audit_buf, 1,
+					  (dev != NULL ? dev->name : NULL),
+					  addr, mask);
 		if (dev != NULL)
 			dev_put(dev);
 		if (entry && security_secid_to_secctx(entry->secid,
-- 
cgit v1.2.3


From 948bf85c1bc9a84754786a9d5dd99b7ecc46451e Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:32 -0400
Subject: netlabel: Add functionality to set the security attributes of a
 packet

This patch builds upon the new NetLabel address selector functionality by
providing the NetLabel KAPI and CIPSO engine support needed to enable the
new packet-based labeling.  The only new addition to the NetLabel KAPI at
this point is shown below:

 * int netlbl_skbuff_setattr(skb, family, secattr)

... and is designed to be called from a Netfilter hook after the packet's
IP header has been populated such as in the FORWARD or LOCAL_OUT hooks.

This patch also provides the necessary SELinux hooks to support this new
functionality.  Smack support is not currently included due to uncertainty
regarding the permissions needed to expand the Smack network access controls.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/ipv4/cipso_ipv4.c        | 222 +++++++++++++++++++++++++++++++++++--------
 net/netlabel/netlabel_kapi.c |  60 ++++++++++++
 2 files changed, 243 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bf87eddfec3..e13d6dbb66a 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -13,7 +13,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -1665,48 +1665,27 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
 }
 
 /**
- * cipso_v4_sock_setattr - Add a CIPSO option to a socket
- * @sk: the socket
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of opt_buf
  * @doi_def: the CIPSO DOI to use
- * @secattr: the specific security attributes of the socket
+ * @secattr: the security attributes
  *
  * Description:
- * Set the CIPSO option on the given socket using the DOI definition and
- * security attributes passed to the function.  This function requires
- * exclusive access to @sk, which means it either needs to be in the
- * process of being created or locked.  Returns zero on success and negative
- * values on failure.
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function.  Returns the length of the option on success and
+ * negative values on failure.
  *
  */
-int cipso_v4_sock_setattr(struct sock *sk,
-			  const struct cipso_v4_doi *doi_def,
-			  const struct netlbl_lsm_secattr *secattr)
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+			   const struct cipso_v4_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr)
 {
-	int ret_val = -EPERM;
+	int ret_val;
 	u32 iter;
-	unsigned char *buf;
-	u32 buf_len = 0;
-	u32 opt_len;
-	struct ip_options *opt = NULL;
-	struct inet_sock *sk_inet;
-	struct inet_connection_sock *sk_conn;
-
-	/* In the case of sock_create_lite(), the sock->sk field is not
-	 * defined yet but it is not a problem as the only users of these
-	 * "lite" PF_INET sockets are functions which do an accept() call
-	 * afterwards so we will label the socket as part of the accept(). */
-	if (sk == NULL)
-		return 0;
 
-	/* We allocate the maximum CIPSO option size here so we are probably
-	 * being a little wasteful, but it makes our life _much_ easier later
-	 * on and after all we are only talking about 40 bytes. */
-	buf_len = CIPSO_V4_OPT_LEN_MAX;
-	buf = kmalloc(buf_len, GFP_ATOMIC);
-	if (buf == NULL) {
-		ret_val = -ENOMEM;
-		goto socket_setattr_failure;
-	}
+	if (buf_len <= CIPSO_V4_HDR_LEN)
+		return -ENOSPC;
 
 	/* XXX - This code assumes only one tag per CIPSO option which isn't
 	 * really a good assumption to make but since we only support the MAC
@@ -1734,8 +1713,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 						   buf_len - CIPSO_V4_HDR_LEN);
 			break;
 		default:
-			ret_val = -EPERM;
-			goto socket_setattr_failure;
+			return -EPERM;
 		}
 
 		iter++;
@@ -1743,9 +1721,58 @@ int cipso_v4_sock_setattr(struct sock *sk,
 		 iter < CIPSO_V4_TAG_MAXCNT &&
 		 doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
 	if (ret_val < 0)
-		goto socket_setattr_failure;
+		return ret_val;
 	cipso_v4_gentag_hdr(doi_def, buf, ret_val);
-	buf_len = CIPSO_V4_HDR_LEN + ret_val;
+	return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked.  Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+			  const struct cipso_v4_doi *doi_def,
+			  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	unsigned char *buf = NULL;
+	u32 buf_len;
+	u32 opt_len;
+	struct ip_options *opt = NULL;
+	struct inet_sock *sk_inet;
+	struct inet_connection_sock *sk_conn;
+
+	/* In the case of sock_create_lite(), the sock->sk field is not
+	 * defined yet but it is not a problem as the only users of these
+	 * "lite" PF_INET sockets are functions which do an accept() call
+	 * afterwards so we will label the socket as part of the accept(). */
+	if (sk == NULL)
+		return 0;
+
+	/* We allocate the maximum CIPSO option size here so we are probably
+	 * being a little wasteful, but it makes our life _much_ easier later
+	 * on and after all we are only talking about 40 bytes. */
+	buf_len = CIPSO_V4_OPT_LEN_MAX;
+	buf = kmalloc(buf_len, GFP_ATOMIC);
+	if (buf == NULL) {
+		ret_val = -ENOMEM;
+		goto socket_setattr_failure;
+	}
+
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		goto socket_setattr_failure;
+	buf_len = ret_val;
 
 	/* We can't use ip_options_get() directly because it makes a call to
 	 * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
@@ -1853,6 +1880,123 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 				secattr);
 }
 
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+			    const struct cipso_v4_doi *doi_def,
+			    const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct iphdr *iph;
+	struct ip_options *opt = &IPCB(skb)->opt;
+	unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+	u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+	u32 opt_len;
+	int len_delta;
+
+	buf_len = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (buf_len < 0)
+		return buf_len;
+	opt_len = (buf_len + 3) & ~3;
+
+	/* we overwrite any existing options to ensure that we have enough
+	 * room for the CIPSO option, the reason is that we _need_ to guarantee
+	 * that the security label is applied to the packet - we do the same
+	 * thing when using the socket options and it hasn't caused a problem,
+	 * if we need to we can always revisit this choice later */
+
+	len_delta = opt_len - opt->optlen;
+	/* if we don't ensure enough headroom we could panic on the skb_push()
+	 * call below so make sure we have enough, we are also "mangling" the
+	 * packet so we should probably do a copy-on-write call anyway */
+	ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+	if (ret_val < 0)
+		return ret_val;
+
+	if (len_delta > 0) {
+		/* we assume that the header + opt->optlen have already been
+		 * "pushed" in ip_options_build() or similar */
+		iph = ip_hdr(skb);
+		skb_push(skb, len_delta);
+		memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+		skb_reset_network_header(skb);
+		iph = ip_hdr(skb);
+	} else if (len_delta < 0) {
+		iph = ip_hdr(skb);
+		memset(iph + 1, IPOPT_NOP, opt->optlen);
+	} else
+		iph = ip_hdr(skb);
+
+	if (opt->optlen > 0)
+		memset(opt, 0, sizeof(*opt));
+	opt->optlen = opt_len;
+	opt->cipso = sizeof(struct iphdr);
+	opt->is_changed = 1;
+
+	/* we have to do the following because we are being called from a
+	 * netfilter hook which means the packet already has had the header
+	 * fields populated and the checksum calculated - yes this means we
+	 * are doing more work than needed but we do it to keep the core
+	 * stack clean and tidy */
+	memcpy(iph + 1, buf, buf_len);
+	if (opt_len > buf_len)
+		memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+	if (len_delta != 0) {
+		iph->ihl = 5 + (opt_len >> 2);
+		iph->tot_len = htons(skb->len);
+	}
+	ip_send_check(iph);
+
+	return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+	int ret_val;
+	struct iphdr *iph;
+	struct ip_options *opt = &IPCB(skb)->opt;
+	unsigned char *cipso_ptr;
+
+	if (opt->cipso == 0)
+		return 0;
+
+	/* since we are changing the packet we should make a copy */
+	ret_val = skb_cow(skb, skb_headroom(skb));
+	if (ret_val < 0)
+		return ret_val;
+
+	/* the easiest thing to do is just replace the cipso option with noop
+	 * options since we don't change the size of the packet, although we
+	 * still need to recalculate the checksum */
+
+	iph = ip_hdr(skb);
+	cipso_ptr = (unsigned char *)iph + opt->cipso;
+	memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+	opt->cipso = 0;
+	opt->is_changed = 1;
+
+	ip_send_check(iph);
+
+	return 0;
+}
+
 /**
  * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
  * @skb: the packet
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 8b820dc9806..cc8047d1f50 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -472,6 +472,66 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 	return cipso_v4_sock_getattr(sk, secattr);
 }
 
+/**
+ * netlbl_skbuff_setattr - Label a packet using the correct protocol
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given packet using the security attributes
+ * specified in @secattr.  Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_skbuff_setattr(struct sk_buff *skb,
+			  u16 family,
+			  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct iphdr *hdr4;
+	struct netlbl_domaddr4_map *af4_entry;
+
+	rcu_read_lock();
+	switch (family) {
+	case AF_INET:
+		hdr4 = ip_hdr(skb);
+		af4_entry = netlbl_domhsh_getentry_af4(secattr->domain,
+						       hdr4->daddr);
+		if (af4_entry == NULL) {
+			ret_val = -ENOENT;
+			goto skbuff_setattr_return;
+		}
+		switch (af4_entry->type) {
+		case NETLBL_NLTYPE_CIPSOV4:
+			ret_val = cipso_v4_skbuff_setattr(skb,
+						   af4_entry->type_def.cipsov4,
+						   secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			ret_val = cipso_v4_skbuff_delattr(skb);
+			break;
+		default:
+			ret_val = -ENOENT;
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		/* since we don't support any IPv6 labeling protocols right
+		 * now we can optimize everything away until we do */
+		ret_val = 0;
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = 0;
+	}
+
+skbuff_setattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
 /**
  * netlbl_skbuff_getattr - Determine the security attributes of a packet
  * @skb: the packet
-- 
cgit v1.2.3


From 014ab19a69c325f52d7bae54ceeda73d6307ae0c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:33 -0400
Subject: selinux: Set socket NetLabel based on connection endpoint

Previous work enabled the use of address based NetLabel selectors, which while
highly useful, brought the potential for additional per-packet overhead when
used.  This patch attempts to solve that by applying NetLabel socket labels
when sockets are connect()'d.  This should alleviate the per-packet NetLabel
labeling for all connected sockets (yes, it even works for connected DGRAM
sockets).

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/ipv4/cipso_ipv4.c        | 74 +++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_kapi.c | 78 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 151 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index e13d6dbb66a..23768b9d6b6 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1809,6 +1809,80 @@ socket_setattr_failure:
 	return ret_val;
 }
 
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+	u8 hdr_delta;
+	struct ip_options *opt;
+	struct inet_sock *sk_inet;
+
+	sk_inet = inet_sk(sk);
+	opt = sk_inet->opt;
+	if (opt == NULL || opt->cipso == 0)
+		return;
+
+	if (opt->srr || opt->rr || opt->ts || opt->router_alert) {
+		u8 cipso_len;
+		u8 cipso_off;
+		unsigned char *cipso_ptr;
+		int iter;
+		int optlen_new;
+
+		cipso_off = opt->cipso - sizeof(struct iphdr);
+		cipso_ptr = &opt->__data[cipso_off];
+		cipso_len = cipso_ptr[1];
+
+		if (opt->srr > opt->cipso)
+			opt->srr -= cipso_len;
+		if (opt->rr > opt->cipso)
+			opt->rr -= cipso_len;
+		if (opt->ts > opt->cipso)
+			opt->ts -= cipso_len;
+		if (opt->router_alert > opt->cipso)
+			opt->router_alert -= cipso_len;
+		opt->cipso = 0;
+
+		memmove(cipso_ptr, cipso_ptr + cipso_len,
+			opt->optlen - cipso_off - cipso_len);
+
+		/* determining the new total option length is tricky because of
+		 * the padding necessary, the only thing i can think to do at
+		 * this point is walk the options one-by-one, skipping the
+		 * padding at the end to determine the actual option size and
+		 * from there we can determine the new total option length */
+		iter = 0;
+		optlen_new = 0;
+		while (iter < opt->optlen)
+			if (opt->__data[iter] != IPOPT_NOP) {
+				iter += opt->__data[iter + 1];
+				optlen_new = iter;
+			} else
+				iter++;
+		hdr_delta = opt->optlen;
+		opt->optlen = (optlen_new + 3) & ~3;
+		hdr_delta -= opt->optlen;
+	} else {
+		/* only the cipso option was present on the socket so we can
+		 * remove the entire option struct */
+		sk_inet->opt = NULL;
+		hdr_delta = opt->optlen;
+		kfree(opt);
+	}
+
+	if (sk_inet->is_icsk && hdr_delta > 0) {
+		struct inet_connection_sock *sk_conn = inet_csk(sk);
+		sk_conn->icsk_ext_hdr_len -= hdr_delta;
+		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+	}
+}
+
 /**
  * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
  * @cipso: the CIPSO v4 option
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index cc8047d1f50..78fc557689b 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -10,7 +10,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -455,6 +455,20 @@ socket_setattr_return:
 	return ret_val;
 }
 
+/**
+ * netlbl_sock_delattr - Delete all the NetLabel labels on a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Remove all the NetLabel labeling from @sk.  The caller is responsible for
+ * ensuring that @sk is locked.
+ *
+ */
+void netlbl_sock_delattr(struct sock *sk)
+{
+	cipso_v4_sock_delattr(sk);
+}
+
 /**
  * netlbl_sock_getattr - Determine the security attributes of a sock
  * @sk: the sock
@@ -472,6 +486,68 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 	return cipso_v4_sock_getattr(sk, secattr);
 }
 
+/**
+ * netlbl_conn_setattr - Label a connected socket using the correct protocol
+ * @sk: the socket to label
+ * @addr: the destination address
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given connected socket using the security
+ * attributes specified in @secattr.  The caller is responsible for ensuring
+ * that @sk is locked.  Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_conn_setattr(struct sock *sk,
+			struct sockaddr *addr,
+			const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct sockaddr_in *addr4;
+	struct netlbl_domaddr4_map *af4_entry;
+
+	rcu_read_lock();
+	switch (addr->sa_family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)addr;
+		af4_entry = netlbl_domhsh_getentry_af4(secattr->domain,
+						       addr4->sin_addr.s_addr);
+		if (af4_entry == NULL) {
+			ret_val = -ENOENT;
+			goto conn_setattr_return;
+		}
+		switch (af4_entry->type) {
+		case NETLBL_NLTYPE_CIPSOV4:
+			ret_val = cipso_v4_sock_setattr(sk,
+						   af4_entry->type_def.cipsov4,
+						   secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			cipso_v4_sock_delattr(sk);
+			ret_val = 0;
+			break;
+		default:
+			ret_val = -ENOENT;
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		/* since we don't support any IPv6 labeling protocols right
+		 * now we can optimize everything away until we do */
+		ret_val = 0;
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = 0;
+	}
+
+conn_setattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
 /**
  * netlbl_skbuff_setattr - Label a packet using the correct protocol
  * @skb: the packet
-- 
cgit v1.2.3


From 15c45f7b2e81655f6eb500ec949c8bd70a04325a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:34 -0400
Subject: cipso: Add support for native local labeling and fixup mapping names

This patch accomplishes three minor tasks: add a new tag type for local
labeling, rename the CIPSO_V4_MAP_STD define to CIPSO_V4_MAP_TRANS and
replace some of the CIPSO "magic numbers" with constants from the header
file.  The first change allows CIPSO to support full LSM labels/contexts,
not just MLS attributes.  The second change brings the mapping names inline
with what userspace is using, compatibility is preserved since we don't
actually change the value.  The last change is to aid readability and help
prevent mistakes.

Signed-off-by: Paul Moore <paul.moore@hp.com>
---
 net/ipv4/cipso_ipv4.c            | 127 +++++++++++++++++++++++++++++++++------
 net/ipv4/ip_options.c            |   2 +-
 net/netlabel/netlabel_cipso_v4.c |  14 ++---
 net/netlabel/netlabel_cipso_v4.h |   4 +-
 net/netlabel/netlabel_kapi.c     |   4 +-
 5 files changed, 119 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 23768b9d6b6..490e035c6d9 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -109,6 +109,19 @@ int cipso_v4_rbm_strictvalid = 1;
  * be omitted. */
 #define CIPSO_V4_TAG_RNG_CAT_MAX      8
 
+/* Base length of the local tag (non-standard tag).
+ *  Tag definition (may change between kernel versions)
+ *
+ * 0          8          16         24         32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value  |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN         6
+
 /*
  * Helper Functions
  */
@@ -467,6 +480,10 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
 			if (doi_def->type != CIPSO_V4_MAP_PASS)
 				return -EINVAL;
 			break;
+		case CIPSO_V4_TAG_LOCAL:
+			if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+				return -EINVAL;
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -502,7 +519,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
 		return;
 
 	switch (doi_def->type) {
-	case CIPSO_V4_MAP_STD:
+	case CIPSO_V4_MAP_TRANS:
 		kfree(doi_def->map.std->lvl.cipso);
 		kfree(doi_def->map.std->lvl.local);
 		kfree(doi_def->map.std->cat.cipso);
@@ -673,7 +690,7 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
 	switch (doi_def->type) {
 	case CIPSO_V4_MAP_PASS:
 		return 0;
-	case CIPSO_V4_MAP_STD:
+	case CIPSO_V4_MAP_TRANS:
 		if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
 			return 0;
 		break;
@@ -702,7 +719,7 @@ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
 	case CIPSO_V4_MAP_PASS:
 		*net_lvl = host_lvl;
 		return 0;
-	case CIPSO_V4_MAP_STD:
+	case CIPSO_V4_MAP_TRANS:
 		if (host_lvl < doi_def->map.std->lvl.local_size &&
 		    doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) {
 			*net_lvl = doi_def->map.std->lvl.local[host_lvl];
@@ -736,7 +753,7 @@ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
 	case CIPSO_V4_MAP_PASS:
 		*host_lvl = net_lvl;
 		return 0;
-	case CIPSO_V4_MAP_STD:
+	case CIPSO_V4_MAP_TRANS:
 		map_tbl = doi_def->map.std;
 		if (net_lvl < map_tbl->lvl.cipso_size &&
 		    map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) {
@@ -773,7 +790,7 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
 	switch (doi_def->type) {
 	case CIPSO_V4_MAP_PASS:
 		return 0;
-	case CIPSO_V4_MAP_STD:
+	case CIPSO_V4_MAP_TRANS:
 		cipso_cat_size = doi_def->map.std->cat.cipso_size;
 		cipso_array = doi_def->map.std->cat.cipso;
 		for (;;) {
@@ -821,7 +838,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
 	u32 host_cat_size = 0;
 	u32 *host_cat_array = NULL;
 
-	if (doi_def->type == CIPSO_V4_MAP_STD) {
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
 		host_cat_size = doi_def->map.std->cat.local_size;
 		host_cat_array = doi_def->map.std->cat.local;
 	}
@@ -836,7 +853,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
 		case CIPSO_V4_MAP_PASS:
 			net_spot = host_spot;
 			break;
-		case CIPSO_V4_MAP_STD:
+		case CIPSO_V4_MAP_TRANS:
 			if (host_spot >= host_cat_size)
 				return -EPERM;
 			net_spot = host_cat_array[host_spot];
@@ -882,7 +899,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
 	u32 net_cat_size = 0;
 	u32 *net_cat_array = NULL;
 
-	if (doi_def->type == CIPSO_V4_MAP_STD) {
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
 		net_cat_size = doi_def->map.std->cat.cipso_size;
 		net_cat_array = doi_def->map.std->cat.cipso;
 	}
@@ -902,7 +919,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
 		case CIPSO_V4_MAP_PASS:
 			host_spot = net_spot;
 			break;
-		case CIPSO_V4_MAP_STD:
+		case CIPSO_V4_MAP_TRANS:
 			if (net_spot >= net_cat_size)
 				return -EPERM;
 			host_spot = net_cat_array[net_spot];
@@ -1238,7 +1255,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
 	} else
 		tag_len = 4;
 
-	buffer[0] = 0x01;
+	buffer[0] = CIPSO_V4_TAG_RBITMAP;
 	buffer[1] = tag_len;
 	buffer[3] = level;
 
@@ -1334,7 +1351,7 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
 	} else
 		tag_len = 4;
 
-	buffer[0] = 0x02;
+	buffer[0] = CIPSO_V4_TAG_ENUM;
 	buffer[1] = tag_len;
 	buffer[3] = level;
 
@@ -1430,7 +1447,7 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
 	} else
 		tag_len = 4;
 
-	buffer[0] = 0x05;
+	buffer[0] = CIPSO_V4_TAG_RANGE;
 	buffer[1] = tag_len;
 	buffer[3] = level;
 
@@ -1483,6 +1500,54 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
 	return 0;
 }
 
+/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag.  Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	if (!(secattr->flags & NETLBL_SECATTR_SECID))
+		return -EPERM;
+
+	buffer[0] = CIPSO_V4_TAG_LOCAL;
+	buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+	*(u32 *)&buffer[2] = secattr->attr.secid;
+
+	return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	secattr->attr.secid = *(u32 *)&tag[2];
+	secattr->flags |= NETLBL_SECATTR_SECID;
+
+	return 0;
+}
+
 /**
  * cipso_v4_validate - Validate a CIPSO option
  * @option: the start of the option, on error it is set to point to the error
@@ -1502,7 +1567,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
  *   that is unrecognized."
  *
  */
-int cipso_v4_validate(unsigned char **option)
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
 {
 	unsigned char *opt = *option;
 	unsigned char *tag;
@@ -1527,7 +1592,7 @@ int cipso_v4_validate(unsigned char **option)
 		goto validate_return_locked;
 	}
 
-	opt_iter = 6;
+	opt_iter = CIPSO_V4_HDR_LEN;
 	tag = opt + opt_iter;
 	while (opt_iter < opt_len) {
 		for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
@@ -1545,7 +1610,7 @@ int cipso_v4_validate(unsigned char **option)
 
 		switch (tag[0]) {
 		case CIPSO_V4_TAG_RBITMAP:
-			if (tag_len < 4) {
+			if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
 				err_offset = opt_iter + 1;
 				goto validate_return_locked;
 			}
@@ -1563,7 +1628,7 @@ int cipso_v4_validate(unsigned char **option)
 					err_offset = opt_iter + 3;
 					goto validate_return_locked;
 				}
-				if (tag_len > 4 &&
+				if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
 				    cipso_v4_map_cat_rbm_valid(doi_def,
 							    &tag[4],
 							    tag_len - 4) < 0) {
@@ -1573,7 +1638,7 @@ int cipso_v4_validate(unsigned char **option)
 			}
 			break;
 		case CIPSO_V4_TAG_ENUM:
-			if (tag_len < 4) {
+			if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
 				err_offset = opt_iter + 1;
 				goto validate_return_locked;
 			}
@@ -1583,7 +1648,7 @@ int cipso_v4_validate(unsigned char **option)
 				err_offset = opt_iter + 3;
 				goto validate_return_locked;
 			}
-			if (tag_len > 4 &&
+			if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
 			    cipso_v4_map_cat_enum_valid(doi_def,
 							&tag[4],
 							tag_len - 4) < 0) {
@@ -1592,7 +1657,7 @@ int cipso_v4_validate(unsigned char **option)
 			}
 			break;
 		case CIPSO_V4_TAG_RANGE:
-			if (tag_len < 4) {
+			if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
 				err_offset = opt_iter + 1;
 				goto validate_return_locked;
 			}
@@ -1602,7 +1667,7 @@ int cipso_v4_validate(unsigned char **option)
 				err_offset = opt_iter + 3;
 				goto validate_return_locked;
 			}
-			if (tag_len > 4 &&
+			if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
 			    cipso_v4_map_cat_rng_valid(doi_def,
 						       &tag[4],
 						       tag_len - 4) < 0) {
@@ -1610,6 +1675,19 @@ int cipso_v4_validate(unsigned char **option)
 				goto validate_return_locked;
 			}
 			break;
+		case CIPSO_V4_TAG_LOCAL:
+			/* This is a non-standard tag that we only allow for
+			 * local connections, so if the incoming interface is
+			 * not the loopback device drop the packet. */
+			if (!(skb->dev->flags & IFF_LOOPBACK)) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+			if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+			break;
 		default:
 			err_offset = opt_iter;
 			goto validate_return_locked;
@@ -1712,6 +1790,12 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
 						   &buf[CIPSO_V4_HDR_LEN],
 						   buf_len - CIPSO_V4_HDR_LEN);
 			break;
+		case CIPSO_V4_TAG_LOCAL:
+			ret_val = cipso_v4_gentag_loc(doi_def,
+						   secattr,
+						   &buf[CIPSO_V4_HDR_LEN],
+						   buf_len - CIPSO_V4_HDR_LEN);
+			break;
 		default:
 			return -EPERM;
 		}
@@ -1921,6 +2005,9 @@ static int cipso_v4_getattr(const unsigned char *cipso,
 	case CIPSO_V4_TAG_RANGE:
 		ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
 		break;
+	case CIPSO_V4_TAG_LOCAL:
+		ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr);
+		break;
 	}
 	if (ret_val == 0)
 		secattr->type = NETLBL_NLTYPE_CIPSOV4;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index be3f18a7a40..2c88da6e786 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -438,7 +438,7 @@ int ip_options_compile(struct net *net,
 				goto error;
 			}
 			opt->cipso = optptr - iph;
-			if (cipso_v4_validate(&optptr)) {
+			if (cipso_v4_validate(skb, &optptr)) {
 				pp_ptr = optptr;
 				goto error;
 			}
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 5c4f60bbc82..db83a67cbc7 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -132,9 +132,9 @@ static int netlbl_cipsov4_add_common(struct genl_info *info,
  * @info: the Generic NETLINK info block
  *
  * Description:
- * Create a new CIPSO_V4_MAP_STD DOI definition based on the given ADD message
- * and add it to the CIPSO V4 engine.  Return zero on success and non-zero on
- * error.
+ * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD
+ * message and add it to the CIPSO V4 engine.  Return zero on success and
+ * non-zero on error.
  *
  */
 static int netlbl_cipsov4_add_std(struct genl_info *info)
@@ -164,7 +164,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info)
 		ret_val = -ENOMEM;
 		goto add_std_failure;
 	}
-	doi_def->type = CIPSO_V4_MAP_STD;
+	doi_def->type = CIPSO_V4_MAP_TRANS;
 
 	ret_val = netlbl_cipsov4_add_common(info, doi_def);
 	if (ret_val != 0)
@@ -393,8 +393,8 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
 
 	type = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE]);
 	switch (type) {
-	case CIPSO_V4_MAP_STD:
-		type_str = "std";
+	case CIPSO_V4_MAP_TRANS:
+		type_str = "trans";
 		ret_val = netlbl_cipsov4_add_std(info);
 		break;
 	case CIPSO_V4_MAP_PASS:
@@ -497,7 +497,7 @@ list_start:
 	nla_nest_end(ans_skb, nla_a);
 
 	switch (doi_def->type) {
-	case CIPSO_V4_MAP_STD:
+	case CIPSO_V4_MAP_TRANS:
 		nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST);
 		if (nla_a == NULL) {
 			ret_val = -ENOMEM;
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
index 220cb9d06b4..fb3957f1d69 100644
--- a/net/netlabel/netlabel_cipso_v4.h
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -45,7 +45,7 @@
  *     NLBL_CIPSOV4_A_MTYPE
  *     NLBL_CIPSOV4_A_TAGLST
  *
- *   If using CIPSO_V4_MAP_STD the following attributes are required:
+ *   If using CIPSO_V4_MAP_TRANS the following attributes are required:
  *
  *     NLBL_CIPSOV4_A_MLSLVLLST
  *     NLBL_CIPSOV4_A_MLSCATLST
@@ -76,7 +76,7 @@
  *     NLBL_CIPSOV4_A_MTYPE
  *     NLBL_CIPSOV4_A_TAGLST
  *
- *   If using CIPSO_V4_MAP_STD the following attributes are required:
+ *   If using CIPSO_V4_MAP_TRANS the following attributes are required:
  *
  *     NLBL_CIPSOV4_A_MLSLVLLST
  *     NLBL_CIPSOV4_A_MLSCATLST
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 78fc557689b..8435b15c3f7 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -157,8 +157,8 @@ cfg_cipsov4_add_map_return:
 					      audit_info);
 	if (audit_buf != NULL) {
 		switch (doi_type) {
-		case CIPSO_V4_MAP_STD:
-			type_str = "std";
+		case CIPSO_V4_MAP_TRANS:
+			type_str = "trans";
 			break;
 		case CIPSO_V4_MAP_PASS:
 			type_str = "pass";
-- 
cgit v1.2.3


From d91d40799165b0c84c97e7c71fb8039494ff07dc Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 10 Oct 2008 10:16:34 -0400
Subject: netlabel: Add configuration support for local labeling

Add the necessary NetLabel support for the new CIPSO mapping,
CIPSO_V4_MAP_LOCAL, which allows full LSM label/context support.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 net/netlabel/netlabel_cipso_v4.c | 41 ++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_cipso_v4.h |  6 ++++--
 net/netlabel/netlabel_kapi.c     |  3 +++
 3 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index db83a67cbc7..fff32b70efa 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -364,6 +364,43 @@ add_pass_failure:
 	return ret_val;
 }
 
+/**
+ * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Create a new CIPSO_V4_MAP_LOCAL DOI definition based on the given ADD
+ * message and add it to the CIPSO V4 engine.  Return zero on success and
+ * non-zero on error.
+ *
+ */
+static int netlbl_cipsov4_add_local(struct genl_info *info)
+{
+	int ret_val;
+	struct cipso_v4_doi *doi_def = NULL;
+
+	if (!info->attrs[NLBL_CIPSOV4_A_TAGLST])
+		return -EINVAL;
+
+	doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+	if (doi_def == NULL)
+		return -ENOMEM;
+	doi_def->type = CIPSO_V4_MAP_LOCAL;
+
+	ret_val = netlbl_cipsov4_add_common(info, doi_def);
+	if (ret_val != 0)
+		goto add_local_failure;
+
+	ret_val = cipso_v4_doi_add(doi_def);
+	if (ret_val != 0)
+		goto add_local_failure;
+	return 0;
+
+add_local_failure:
+	cipso_v4_doi_free(doi_def);
+	return ret_val;
+}
+
 /**
  * netlbl_cipsov4_add - Handle an ADD message
  * @skb: the NETLINK buffer
@@ -401,6 +438,10 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
 		type_str = "pass";
 		ret_val = netlbl_cipsov4_add_pass(info);
 		break;
+	case CIPSO_V4_MAP_LOCAL:
+		type_str = "local";
+		ret_val = netlbl_cipsov4_add_local(info);
+		break;
 	}
 	if (ret_val == 0)
 		atomic_inc(&netlabel_mgmt_protocount);
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
index fb3957f1d69..c8a4079261f 100644
--- a/net/netlabel/netlabel_cipso_v4.h
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -50,7 +50,8 @@
  *     NLBL_CIPSOV4_A_MLSLVLLST
  *     NLBL_CIPSOV4_A_MLSCATLST
  *
- *   If using CIPSO_V4_MAP_PASS no additional attributes are required.
+ *   If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes
+ *   are required.
  *
  * o REMOVE:
  *   Sent by an application to remove a specific DOI mapping table from the
@@ -81,7 +82,8 @@
  *     NLBL_CIPSOV4_A_MLSLVLLST
  *     NLBL_CIPSOV4_A_MLSCATLST
  *
- *   If using CIPSO_V4_MAP_PASS no additional attributes are required.
+ *   If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes
+ *   are required.
  *
  * o LISTALL:
  *   This message is sent by an application to list the valid DOIs on the
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 8435b15c3f7..b32eceb3ab0 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -163,6 +163,9 @@ cfg_cipsov4_add_map_return:
 		case CIPSO_V4_MAP_PASS:
 			type_str = "pass";
 			break;
+		case CIPSO_V4_MAP_LOCAL:
+			type_str = "local";
+			break;
 		default:
 			type_str = "(unknown)";
 		}
-- 
cgit v1.2.3


From 8d4ba0347ccfea4f12e56e2484954b891411b74d Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 14:59:49 -0400
Subject: RPC/RDMA: refactor the inline memory registration code.

Refactor the memory registration and deregistration routines.
This saves stack space, makes the code more readable and prepares
to add the new FRMR registration methods.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 365 +++++++++++++++++++++++++-------------------
 1 file changed, 207 insertions(+), 158 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8ea283ecc52..d04208a02f6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -863,6 +863,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	char *p;
 	size_t len;
 	int i, rc;
+	struct rpcrdma_mw *r;
 
 	buf->rb_max_requests = cdata->max_requests;
 	spin_lock_init(&buf->rb_lock);
@@ -873,7 +874,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	 *   2.  arrays of struct rpcrdma_req to fill in pointers
 	 *   3.  array of struct rpcrdma_rep for replies
 	 *   4.  padding, if any
-	 *   5.  mw's, if any
+	 *   5.  mw's or fmr's, if any
 	 * Send/recv buffers in req/rep need to be registered
 	 */
 
@@ -927,15 +928,13 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	 * and also reduce unbind-to-bind collision.
 	 */
 	INIT_LIST_HEAD(&buf->rb_mws);
+	r = (struct rpcrdma_mw *)p;
 	switch (ia->ri_memreg_strategy) {
 	case RPCRDMA_MTHCAFMR:
-		{
-		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
-		struct ib_fmr_attr fa = {
-			RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
-		};
 		/* TBD we are perhaps overallocating here */
 		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			static struct ib_fmr_attr fa =
+				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
 			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
 				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
 				&fa);
@@ -948,12 +947,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 			list_add(&r->mw_list, &buf->rb_mws);
 			++r;
 		}
-		}
 		break;
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
-		{
-		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
 		/* Allocate one extra request's worth, for full cycling */
 		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
 			r->r.mw = ib_alloc_mw(ia->ri_pd);
@@ -966,7 +962,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 			list_add(&r->mw_list, &buf->rb_mws);
 			++r;
 		}
-		}
 		break;
 	default:
 		break;
@@ -1046,6 +1041,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
 	int rc, i;
 	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	struct rpcrdma_mw *r;
 
 	/* clean up in reverse order from create
 	 *   1.  recv mr memory (mr free, then kfree)
@@ -1065,7 +1061,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 		}
 		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
 			while (!list_empty(&buf->rb_mws)) {
-				struct rpcrdma_mw *r;
 				r = list_entry(buf->rb_mws.next,
 					struct rpcrdma_mw, mw_list);
 				list_del(&r->mw_list);
@@ -1115,6 +1110,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
 	struct rpcrdma_req *req;
 	unsigned long flags;
+	int i;
+	struct rpcrdma_mw *r;
 
 	spin_lock_irqsave(&buffers->rb_lock, flags);
 	if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1135,9 +1132,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 	}
 	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
 	if (!list_empty(&buffers->rb_mws)) {
-		int i = RPCRDMA_MAX_SEGS - 1;
+		i = RPCRDMA_MAX_SEGS - 1;
 		do {
-			struct rpcrdma_mw *r;
 			r = list_entry(buffers->rb_mws.next,
 					struct rpcrdma_mw, mw_list);
 			list_del(&r->mw_list);
@@ -1329,15 +1325,202 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
 				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
 }
 
+static int
+rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+	int len, pageoff, i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	for (i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		physaddrs[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+				physaddrs, i, seg1->mr_dma);
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_map_phys_fmr "
+			"%u@0x%llx+%i (%d)... status %i\n", __func__,
+			len, (unsigned long long)seg1->mr_dma,
+			pageoff, i, rc);
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return rc;
+}
+
+static int
+rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	LIST_HEAD(l);
+	int rc;
+
+	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
+	rc = ib_unmap_fmr(&l);
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_unmap_fmr,"
+			" status %i\n", __func__, rc);
+	return rc;
+}
+
+static int
+rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{
+	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+				  IB_ACCESS_REMOTE_READ);
+	struct ib_mw_bind param;
+	int rc;
+
+	*nsegs = 1;
+	rpcrdma_map_one(ia, seg, writing);
+	param.mr = ia->ri_bind_mem;
+	param.wr_id = 0ULL;	/* no send cookie */
+	param.addr = seg->mr_dma;
+	param.length = seg->mr_len;
+	param.send_flags = 0;
+	param.mw_access_flags = mem_priv;
+
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_bind_mw "
+			"%u@0x%llx status %i\n",
+			__func__, seg->mr_len,
+			(unsigned long long)seg->mr_dma, rc);
+		rpcrdma_unmap_one(ia, seg);
+	} else {
+		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+		seg->mr_base = param.addr;
+		seg->mr_nsegs = 1;
+	}
+	return rc;
+}
+
+static int
+rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt, void **r)
+{
+	struct ib_mw_bind param;
+	LIST_HEAD(l);
+	int rc;
+
+	BUG_ON(seg->mr_nsegs != 1);
+	param.mr = ia->ri_bind_mem;
+	param.addr = 0ULL;	/* unbind */
+	param.length = 0;
+	param.mw_access_flags = 0;
+	if (*r) {
+		param.wr_id = (u64) (unsigned long) *r;
+		param.send_flags = IB_SEND_SIGNALED;
+		INIT_CQCOUNT(&r_xprt->rx_ep);
+	} else {
+		param.wr_id = 0ULL;
+		param.send_flags = 0;
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+	}
+	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+	rpcrdma_unmap_one(ia, seg);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_(un)bind_mw,"
+			" status %i\n", __func__, rc);
+	else
+		*r = NULL;	/* will upcall on completion */
+	return rc;
+}
+
+static int
+rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+				  IB_ACCESS_REMOTE_READ);
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+	int len, i, rc = 0;
+
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	for (len = 0, i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		ipb[i].addr = seg->mr_dma;
+		ipb[i].size = seg->mr_len;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+			break;
+	}
+	seg1->mr_base = seg1->mr_dma;
+	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+				ipb, i, mem_priv, &seg1->mr_base);
+	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+		dprintk("RPC:       %s: failed ib_reg_phys_mr "
+			"%u@0x%llx (%d)... status %i\n",
+			__func__, len,
+			(unsigned long long)seg1->mr_dma, i, rc);
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return rc;
+}
+
+static int
+rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int rc;
+
+	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+	seg1->mr_chunk.rl_mr = NULL;
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_dereg_mr,"
+			" status %i\n", __func__, rc);
+	return rc;
+}
+
 int
 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
-				  IB_ACCESS_REMOTE_READ);
-	struct rpcrdma_mr_seg *seg1 = seg;
-	int i;
 	int rc = 0;
 
 	switch (ia->ri_memreg_strategy) {
@@ -1352,114 +1535,20 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 		break;
 #endif
 
-	/* Registration using fast memory registration */
+	/* Registration using fmr memory registration */
 	case RPCRDMA_MTHCAFMR:
-		{
-		u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
-		int len, pageoff = offset_in_page(seg->mr_offset);
-		seg1->mr_offset -= pageoff;	/* start of page */
-		seg1->mr_len += pageoff;
-		len = -pageoff;
-		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
-			nsegs = RPCRDMA_MAX_DATA_SEGS;
-		for (i = 0; i < nsegs;) {
-			rpcrdma_map_one(ia, seg, writing);
-			physaddrs[i] = seg->mr_dma;
-			len += seg->mr_len;
-			++seg;
-			++i;
-			/* Check for holes */
-			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
-			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
-				break;
-		}
-		nsegs = i;
-		rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
-					physaddrs, nsegs, seg1->mr_dma);
-		if (rc) {
-			dprintk("RPC:       %s: failed ib_map_phys_fmr "
-				"%u@0x%llx+%i (%d)... status %i\n", __func__,
-				len, (unsigned long long)seg1->mr_dma,
-				pageoff, nsegs, rc);
-			while (nsegs--)
-				rpcrdma_unmap_one(ia, --seg);
-		} else {
-			seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
-			seg1->mr_base = seg1->mr_dma + pageoff;
-			seg1->mr_nsegs = nsegs;
-			seg1->mr_len = len;
-		}
-		}
+		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
 		break;
 
 	/* Registration using memory windows */
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
-		{
-		struct ib_mw_bind param;
-		rpcrdma_map_one(ia, seg, writing);
-		param.mr = ia->ri_bind_mem;
-		param.wr_id = 0ULL;	/* no send cookie */
-		param.addr = seg->mr_dma;
-		param.length = seg->mr_len;
-		param.send_flags = 0;
-		param.mw_access_flags = mem_priv;
-
-		DECR_CQCOUNT(&r_xprt->rx_ep);
-		rc = ib_bind_mw(ia->ri_id->qp,
-					seg->mr_chunk.rl_mw->r.mw, &param);
-		if (rc) {
-			dprintk("RPC:       %s: failed ib_bind_mw "
-				"%u@0x%llx status %i\n",
-				__func__, seg->mr_len,
-				(unsigned long long)seg->mr_dma, rc);
-			rpcrdma_unmap_one(ia, seg);
-		} else {
-			seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
-			seg->mr_base = param.addr;
-			seg->mr_nsegs = 1;
-			nsegs = 1;
-		}
-		}
+		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
 		break;
 
 	/* Default registration each time */
 	default:
-		{
-		struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
-		int len = 0;
-		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
-			nsegs = RPCRDMA_MAX_DATA_SEGS;
-		for (i = 0; i < nsegs;) {
-			rpcrdma_map_one(ia, seg, writing);
-			ipb[i].addr = seg->mr_dma;
-			ipb[i].size = seg->mr_len;
-			len += seg->mr_len;
-			++seg;
-			++i;
-			/* Check for holes */
-			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
-			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
-				break;
-		}
-		nsegs = i;
-		seg1->mr_base = seg1->mr_dma;
-		seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
-					ipb, nsegs, mem_priv, &seg1->mr_base);
-		if (IS_ERR(seg1->mr_chunk.rl_mr)) {
-			rc = PTR_ERR(seg1->mr_chunk.rl_mr);
-			dprintk("RPC:       %s: failed ib_reg_phys_mr "
-				"%u@0x%llx (%d)... status %i\n",
-				__func__, len,
-				(unsigned long long)seg1->mr_dma, nsegs, rc);
-			while (nsegs--)
-				rpcrdma_unmap_one(ia, --seg);
-		} else {
-			seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
-			seg1->mr_nsegs = nsegs;
-			seg1->mr_len = len;
-		}
-		}
+		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
 		break;
 	}
 	if (rc)
@@ -1473,7 +1562,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 		struct rpcrdma_xprt *r_xprt, void *r)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_mr_seg *seg1 = seg;
 	int nsegs = seg->mr_nsegs, rc;
 
 	switch (ia->ri_memreg_strategy) {
@@ -1487,55 +1575,16 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 #endif
 
 	case RPCRDMA_MTHCAFMR:
-		{
-		LIST_HEAD(l);
-		list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
-		rc = ib_unmap_fmr(&l);
-		while (seg1->mr_nsegs--)
-			rpcrdma_unmap_one(ia, seg++);
-		}
-		if (rc)
-			dprintk("RPC:       %s: failed ib_unmap_fmr,"
-				" status %i\n", __func__, rc);
+		rc = rpcrdma_deregister_fmr_external(seg, ia);
 		break;
 
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
-		{
-		struct ib_mw_bind param;
-		BUG_ON(nsegs != 1);
-		param.mr = ia->ri_bind_mem;
-		param.addr = 0ULL;	/* unbind */
-		param.length = 0;
-		param.mw_access_flags = 0;
-		if (r) {
-			param.wr_id = (u64) (unsigned long) r;
-			param.send_flags = IB_SEND_SIGNALED;
-			INIT_CQCOUNT(&r_xprt->rx_ep);
-		} else {
-			param.wr_id = 0ULL;
-			param.send_flags = 0;
-			DECR_CQCOUNT(&r_xprt->rx_ep);
-		}
-		rc = ib_bind_mw(ia->ri_id->qp,
-				seg->mr_chunk.rl_mw->r.mw, &param);
-		rpcrdma_unmap_one(ia, seg);
-		}
-		if (rc)
-			dprintk("RPC:       %s: failed ib_(un)bind_mw,"
-				" status %i\n", __func__, rc);
-		else
-			r = NULL;	/* will upcall on completion */
+		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
 		break;
 
 	default:
-		rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
-		seg1->mr_chunk.rl_mr = NULL;
-		while (seg1->mr_nsegs--)
-			rpcrdma_unmap_one(ia, seg++);
-		if (rc)
-			dprintk("RPC:       %s: failed ib_dereg_mr,"
-				" status %i\n", __func__, rc);
+		rc = rpcrdma_deregister_default_external(seg, ia);
 		break;
 	}
 	if (r) {
-- 
cgit v1.2.3


From fe9053b30bb48b99f7b45541249f5cfe96bdf7f7 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 14:59:59 -0400
Subject: RPC/RDMA: add data types and new FRMR memory registration enum.

Internal RPC/RDMA structure updates in preparation for FRMR support.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Acked-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/xprt_rdma.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2427822f8bd..05b7898e1f4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -58,6 +58,8 @@ struct rpcrdma_ia {
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
 	struct ib_mr		*ri_bind_mem;
+	u32			ri_dma_lkey;
+	int			ri_have_dma_lkey;
 	struct completion	ri_done;
 	int			ri_async_rc;
 	enum rpcrdma_memreg	ri_memreg_strategy;
@@ -156,6 +158,10 @@ struct rpcrdma_mr_seg {		/* chunk descriptors */
 			union {
 				struct ib_mw	*mw;
 				struct ib_fmr	*fmr;
+				struct {
+					struct ib_fast_reg_page_list *fr_pgl;
+					struct ib_mr *fr_mr;
+				} frmr;
 			} r;
 			struct list_head mw_list;
 		} *rl_mw;
@@ -198,7 +204,7 @@ struct rpcrdma_buffer {
 	atomic_t	rb_credits;	/* most recent server credits */
 	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
 	int		rb_max_requests;/* client max requests */
-	struct list_head rb_mws;	/* optional memory windows/fmrs */
+	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
 	int		rb_send_index;
 	struct rpcrdma_req	**rb_send_bufs;
 	int		rb_recv_index;
-- 
cgit v1.2.3


From bd7ed1d13304d914648dacec4dbb9145aaae614e Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:00:09 -0400
Subject: RPC/RDMA: check selected memory registration mode at runtime.

At transport creation, check for, and use, any local dma lkey.
Then, check that the selected memory registration mode is in fact
supported by the RDMA adapter selected for the mount. Fall back
to best alternative if not.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Acked-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 95 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index d04208a02f6..0f3b43148b7 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -423,7 +423,8 @@ rpcrdma_clean_cq(struct ib_cq *cq)
 int
 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 {
-	int rc;
+	int rc, mem_priv;
+	struct ib_device_attr devattr;
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 
 	init_completion(&ia->ri_done);
@@ -442,6 +443,53 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		goto out2;
 	}
 
+	/*
+	 * Query the device to determine if the requested memory
+	 * registration strategy is supported. If it isn't, set the
+	 * strategy to a globally supported model.
+	 */
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+		ia->ri_have_dma_lkey = 1;
+		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+	}
+
+	switch (memreg) {
+	case RPCRDMA_MEMWINDOWS:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+			dprintk("RPC:       %s: MEMWINDOWS registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
+		}
+		break;
+	case RPCRDMA_MTHCAFMR:
+		if (!ia->ri_id->device->alloc_fmr) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+			dprintk("RPC:       %s: MTHCAFMR registration "
+				"specified but not supported by adapter, "
+				"using riskier RPCRDMA_ALLPHYSICAL\n",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+#else
+			dprintk("RPC:       %s: MTHCAFMR registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
+#endif
+		}
+		break;
+	}
+
 	/*
 	 * Optionally obtain an underlying physical identity mapping in
 	 * order to do a memory window-based bind. This base registration
@@ -450,22 +498,27 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	 * revoked after the corresponding completion similar to a storage
 	 * adapter.
 	 */
-	if (memreg > RPCRDMA_REGISTER) {
-		int mem_priv = IB_ACCESS_LOCAL_WRITE;
-		switch (memreg) {
+	switch (memreg) {
+	case RPCRDMA_BOUNCEBUFFERS:
+	case RPCRDMA_REGISTER:
+		break;
 #if RPCRDMA_PERSISTENT_REGISTRATION
-		case RPCRDMA_ALLPHYSICAL:
-			mem_priv |= IB_ACCESS_REMOTE_WRITE;
-			mem_priv |= IB_ACCESS_REMOTE_READ;
-			break;
+	case RPCRDMA_ALLPHYSICAL:
+		mem_priv = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_WRITE |
+				IB_ACCESS_REMOTE_READ;
+		goto register_setup;
 #endif
-		case RPCRDMA_MEMWINDOWS_ASYNC:
-		case RPCRDMA_MEMWINDOWS:
-			mem_priv |= IB_ACCESS_MW_BIND;
-			break;
-		default:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		mem_priv = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_MW_BIND;
+		goto register_setup;
+	case RPCRDMA_MTHCAFMR:
+		if (ia->ri_have_dma_lkey)
 			break;
-		}
+		mem_priv = IB_ACCESS_LOCAL_WRITE;
+	register_setup:
 		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
 		if (IS_ERR(ia->ri_bind_mem)) {
 			printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -475,7 +528,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 			memreg = RPCRDMA_REGISTER;
 			ia->ri_bind_mem = NULL;
 		}
+		break;
+	default:
+		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
+				__func__, memreg);
+		rc = -EINVAL;
+		goto out2;
 	}
+	dprintk("RPC:       %s: memory registration strategy is %d\n",
+		__func__, memreg);
 
 	/* Else will do memory reg/dereg for each chunk */
 	ia->ri_memreg_strategy = memreg;
@@ -1248,7 +1309,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
 			va, len, DMA_BIDIRECTIONAL);
 	iov->length = len;
 
-	if (ia->ri_bind_mem != NULL) {
+	if (ia->ri_have_dma_lkey) {
+		*mrp = NULL;
+		iov->lkey = ia->ri_dma_lkey;
+		return 0;
+	} else if (ia->ri_bind_mem != NULL) {
 		*mrp = NULL;
 		iov->lkey = ia->ri_bind_mem->lkey;
 		return 0;
-- 
cgit v1.2.3


From 3197d309f5fb042499b2c4c8f2fcb67372df5201 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:00:20 -0400
Subject: RPC/RDMA: support FRMR client memory registration.

Configure, detect and use "fastreg" support from IB/iWARP verbs
layer to perform RPC/RDMA memory registration.

Make FRMR the default memreg mode (will fall back if not supported
by the selected RDMA adapter).

This allows full and optimal operation over the cxgb3 adapter, and others.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Acked-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/transport.c |   6 +-
 net/sunrpc/xprtrdma/verbs.c     | 167 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index a564c1a39ec..89970b0a4cc 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -70,11 +70,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
 static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
-#if !RPCRDMA_PERSISTENT_REGISTRATION
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
-#else
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
-#endif
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
 
 #ifdef RPC_DEBUG
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 0f3b43148b7..39a165202d8 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -485,6 +485,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 				"using slower RPCRDMA_REGISTER\n",
 				__func__);
 			memreg = RPCRDMA_REGISTER;
+#endif
+		}
+		break;
+	case RPCRDMA_FRMR:
+		/* Requires both frmr reg and local dma lkey */
+		if ((devattr.device_cap_flags &
+		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+			dprintk("RPC:       %s: FRMR registration "
+				"specified but not supported by adapter, "
+				"using riskier RPCRDMA_ALLPHYSICAL\n",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+#else
+			dprintk("RPC:       %s: FRMR registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
 #endif
 		}
 		break;
@@ -501,6 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	switch (memreg) {
 	case RPCRDMA_BOUNCEBUFFERS:
 	case RPCRDMA_REGISTER:
+	case RPCRDMA_FRMR:
 		break;
 #if RPCRDMA_PERSISTENT_REGISTRATION
 	case RPCRDMA_ALLPHYSICAL:
@@ -602,6 +623,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.srq = NULL;
 	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		/* Add room for frmr register and invalidate WRs */
+		ep->rep_attr.cap.max_send_wr *= 3;
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+			return -EINVAL;
+		break;
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
 		/* Add room for mw_binds+unbinds - overkill! */
@@ -684,6 +711,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 		break;
 	case RPCRDMA_MTHCAFMR:
 	case RPCRDMA_REGISTER:
+	case RPCRDMA_FRMR:
 		ep->rep_remote_cma.responder_resources = cdata->max_requests *
 				(RPCRDMA_MAX_DATA_SEGS / 8);
 		break;
@@ -935,7 +963,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	 *   2.  arrays of struct rpcrdma_req to fill in pointers
 	 *   3.  array of struct rpcrdma_rep for replies
 	 *   4.  padding, if any
-	 *   5.  mw's or fmr's, if any
+	 *   5.  mw's, fmr's or frmr's, if any
 	 * Send/recv buffers in req/rep need to be registered
 	 */
 
@@ -943,6 +971,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
 	len += cdata->padding;
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
 	case RPCRDMA_MTHCAFMR:
 		/* TBD we are perhaps overallocating here */
 		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -991,6 +1023,30 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	INIT_LIST_HEAD(&buf->rb_mws);
 	r = (struct rpcrdma_mw *)p;
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+							 RPCRDMA_MAX_SEGS);
+			if (IS_ERR(r->r.frmr.fr_mr)) {
+				rc = PTR_ERR(r->r.frmr.fr_mr);
+				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			r->r.frmr.fr_pgl =
+				ib_alloc_fast_reg_page_list(ia->ri_id->device,
+							    RPCRDMA_MAX_SEGS);
+			if (IS_ERR(r->r.frmr.fr_pgl)) {
+				rc = PTR_ERR(r->r.frmr.fr_pgl);
+				dprintk("RPC:       %s: "
+					"ib_alloc_fast_reg_page_list "
+					"failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		break;
 	case RPCRDMA_MTHCAFMR:
 		/* TBD we are perhaps overallocating here */
 		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
@@ -1126,6 +1182,15 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 					struct rpcrdma_mw, mw_list);
 				list_del(&r->mw_list);
 				switch (ia->ri_memreg_strategy) {
+				case RPCRDMA_FRMR:
+					rc = ib_dereg_mr(r->r.frmr.fr_mr);
+					if (rc)
+						dprintk("RPC:       %s:"
+							" ib_dereg_mr"
+							" failed %i\n",
+							__func__, rc);
+					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+					break;
 				case RPCRDMA_MTHCAFMR:
 					rc = ib_dealloc_fmr(r->r.fmr);
 					if (rc)
@@ -1228,6 +1293,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 		req->rl_reply = NULL;
 	}
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
 	case RPCRDMA_MTHCAFMR:
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
@@ -1390,6 +1456,96 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
 				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
 }
 
+static int
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr frmr_wr, *bad_wr;
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	for (i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
+		__func__, seg1->mr_chunk.rl_mw, i);
+
+	/* Bump the key */
+	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
+	/* Prepare FRMR WR */
+	memset(&frmr_wr, 0, sizeof frmr_wr);
+	frmr_wr.opcode = IB_WR_FAST_REG_MR;
+	frmr_wr.send_flags = 0;			/* unsignaled */
+	frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
+	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+	frmr_wr.wr.fast_reg.page_list_len = i;
+	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
+	frmr_wr.wr.fast_reg.access_flags = (writing ?
+				IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
+	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
+
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_post_send for register,"
+			" status %i\n", __func__, rc);
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return rc;
+}
+
+static int
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+
+	memset(&invalidate_wr, 0, sizeof invalidate_wr);
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.send_flags = 0;			/* unsignaled */
+	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
+			" status %i\n", __func__, rc);
+	return rc;
+}
+
 static int
 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
 			int *nsegs, int writing, struct rpcrdma_ia *ia)
@@ -1600,6 +1756,11 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 		break;
 #endif
 
+	/* Registration using frmr registration */
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
+		break;
+
 	/* Registration using fmr memory registration */
 	case RPCRDMA_MTHCAFMR:
 		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
@@ -1639,6 +1800,10 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 		break;
 #endif
 
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
+		break;
+
 	case RPCRDMA_MTHCAFMR:
 		rc = rpcrdma_deregister_fmr_external(seg, ia);
 		break;
-- 
cgit v1.2.3


From b334eaabf4f92226d2df13c613888a507f03da99 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 9 Oct 2008 15:00:30 -0400
Subject: RPC/RDMA: fix connection IRD/ORD setting

This logic sets the connection parameter that configures the local device
and informs the remote peer how many concurrent incoming RDMA_READ
requests are supported. The original logic didn't really do what was
intended for two reasons:

- The max number supported by the device is typically smaller than
any one factor in the calculation used, and

- The field in the connection parameter structure where the value is
stored is a u8 and always overflows for the default settings.

So what really happens is the value requested for responder resources
is the left over 8 bits from the "desired value". If the desired value
happened to be a multiple of 256, the result was zero and it wouldn't
connect at all.

Given the above and the fact that max_requests is almost always larger
than the max responder resources supported by the adapter, this patch
simplifies this logic and simply requests the max supported by the device,
subject to a reasonable limit.

This bug was found by Jim Schutt at Sandia.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 51 +++++++++++++--------------------------------
 1 file changed, 14 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 39a165202d8..e3fe9054fef 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -705,30 +705,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_remote_cma.private_data_len = 0;
 
 	/* Client offers RDMA Read but does not initiate */
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_BOUNCEBUFFERS:
+	ep->rep_remote_cma.initiator_depth = 0;
+	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
 		ep->rep_remote_cma.responder_resources = 0;
-		break;
-	case RPCRDMA_MTHCAFMR:
-	case RPCRDMA_REGISTER:
-	case RPCRDMA_FRMR:
-		ep->rep_remote_cma.responder_resources = cdata->max_requests *
-				(RPCRDMA_MAX_DATA_SEGS / 8);
-		break;
-	case RPCRDMA_MEMWINDOWS:
-	case RPCRDMA_MEMWINDOWS_ASYNC:
-#if RPCRDMA_PERSISTENT_REGISTRATION
-	case RPCRDMA_ALLPHYSICAL:
-#endif
-		ep->rep_remote_cma.responder_resources = cdata->max_requests *
-				(RPCRDMA_MAX_DATA_SEGS / 2);
-		break;
-	default:
-		break;
-	}
-	if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
+	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
+		ep->rep_remote_cma.responder_resources = 32;
+	else
 		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
-	ep->rep_remote_cma.initiator_depth = 0;
 
 	ep->rep_remote_cma.retry_count = 7;
 	ep->rep_remote_cma.flow_control = 0;
@@ -858,14 +841,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
 	}
 }
 
-	/* Theoretically a client initiator_depth > 0 is not needed,
-	 * but many peers fail to complete the connection unless they
-	 * == responder_resources! */
-	if (ep->rep_remote_cma.initiator_depth !=
-				ep->rep_remote_cma.responder_resources)
-		ep->rep_remote_cma.initiator_depth =
-			ep->rep_remote_cma.responder_resources;
-
 	ep->rep_connected = 0;
 
 	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -894,14 +869,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
 	if (ep->rep_connected <= 0) {
 		/* Sometimes, the only way to reliably connect to remote
 		 * CMs is to use same nonzero values for ORD and IRD. */
-		ep->rep_remote_cma.initiator_depth =
-					ep->rep_remote_cma.responder_resources;
-		if (ep->rep_remote_cma.initiator_depth == 0)
-			++ep->rep_remote_cma.initiator_depth;
-		if (ep->rep_remote_cma.responder_resources == 0)
-			++ep->rep_remote_cma.responder_resources;
-		if (retry_count++ == 0)
+		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+		    (ep->rep_remote_cma.responder_resources == 0 ||
+		     ep->rep_remote_cma.initiator_depth !=
+				ep->rep_remote_cma.responder_resources)) {
+			if (ep->rep_remote_cma.responder_resources == 0)
+				ep->rep_remote_cma.responder_resources = 1;
+			ep->rep_remote_cma.initiator_depth =
+				ep->rep_remote_cma.responder_resources;
 			goto retry;
+		}
 		rc = ep->rep_connected;
 	} else {
 		dprintk("RPC:       %s: connected\n", __func__);
-- 
cgit v1.2.3


From ba9e64b1c23f1dd22fea14c310f739d84ac8b748 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 10 Oct 2008 12:10:30 -0700
Subject: gre: fix copy and paste error

The flags are dumped twice, the keys not at all.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0d5e35b0ed5..c0755e98b81 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1539,8 +1539,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
 	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
 	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
-	NLA_PUT_BE32(skb, IFLA_GRE_IFLAGS, p->i_flags);
-	NLA_PUT_BE32(skb, IFLA_GRE_OFLAGS, p->o_flags);
+	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
+	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
 	NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr);
 	NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr);
 	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
-- 
cgit v1.2.3


From 575448bd36208f99fe0dd554a43518d798966740 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:00:40 -0400
Subject: RPC/RDMA: suppress retransmit on RPC/RDMA clients.

An RPC/RDMA client cannot retransmit on an unbroken connection,
doing so violates its flow control with the server.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |  2 ++
 net/sunrpc/xprtrdma/transport.c | 16 ++++++++++++----
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e55427f73df..721dae795d6 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -681,6 +681,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
 	struct rpc_xprt *xprt = ep->rep_xprt;
 
 	spin_lock_bh(&xprt->transport_lock);
+	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
+		++xprt->connect_cookie;
 	if (ep->rep_connected > 0) {
 		if (!xprt_test_and_set_connected(xprt))
 			xprt_wake_pending_tasks(xprt, 0);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 89970b0a4cc..0aefc648538 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -587,6 +587,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 	}
 	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
 out:
+	req->rl_connect_cookie = 0;	/* our reserved value */
 	return req->rl_xdr_buf;
 
 outfail:
@@ -690,13 +691,20 @@ xprt_rdma_send_request(struct rpc_task *task)
 		req->rl_reply->rr_xprt = xprt;
 	}
 
-	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
-		xprt_disconnect_done(xprt);
-		return -ENOTCONN;	/* implies disconnect */
-	}
+	/* Must suppress retransmit to maintain credits */
+	if (req->rl_connect_cookie == xprt->connect_cookie)
+		goto drop_connection;
+	req->rl_connect_cookie = xprt->connect_cookie;
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+		goto drop_connection;
 
 	rqst->rq_bytes_sent = 0;
 	return 0;
+
+drop_connection:
+	xprt_disconnect_done(xprt);
+	return -ENOTCONN;	/* implies disconnect */
 }
 
 static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 05b7898e1f4..2db2344d487 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -181,6 +181,7 @@ struct rpcrdma_req {
 	size_t 		rl_size;	/* actual length of buffer */
 	unsigned int	rl_niovs;	/* 0, 2 or 4 */
 	unsigned int	rl_nchunks;	/* non-zero if chunks */
+	unsigned int	rl_connect_cookie;	/* retry detection */
 	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
-- 
cgit v1.2.3


From ad0e9e01da4ece70ff524b49c77c5e850d5dd53e Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:00:50 -0400
Subject: RPC/RDMA: maintain the RPC task bytes-sent statistic.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/transport.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 0aefc648538..ec6d1e7a194 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -699,6 +699,7 @@ xprt_rdma_send_request(struct rpc_task *task)
 	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
 		goto drop_connection;
 
+	task->tk_bytes_sent += rqst->rq_snd_buf.len;
 	rqst->rq_bytes_sent = 0;
 	return 0;
 
-- 
cgit v1.2.3


From 4d74f8ba1fb152ae07eb858abb713e094e77b7d5 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 10 Oct 2008 12:11:06 -0700
Subject: gre: minor cleanups in netlink interface

- use typeful helpers for IFLA_GRE_LOCAL/IFLA_GRE_REMOTE
- replace magic value by FIELD_SIZEOF
- use MODULE_ALIAS_RTNL_LINK macro

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c0755e98b81..05ebce2881e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1368,10 +1368,10 @@ static void ipgre_netlink_parms(struct nlattr *data[],
 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
 
 	if (data[IFLA_GRE_LOCAL])
-		memcpy(&parms->iph.saddr, nla_data(data[IFLA_GRE_LOCAL]), 4);
+		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
 
 	if (data[IFLA_GRE_REMOTE])
-		memcpy(&parms->iph.daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
 
 	if (data[IFLA_GRE_TTL])
 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
@@ -1541,8 +1541,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
 	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
 	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
-	NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr);
-	NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr);
+	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
+	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
 	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
 	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
 	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
@@ -1559,8 +1559,8 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
-	[IFLA_GRE_LOCAL]	= { .len = 4 },
-	[IFLA_GRE_REMOTE]	= { .len = 4 },
+	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
@@ -1643,5 +1643,5 @@ static void __exit ipgre_fini(void)
 module_init(ipgre_init);
 module_exit(ipgre_fini);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("rtnl-link-gre");
-MODULE_ALIAS("rtnl-link-gretap");
+MODULE_ALIAS_RTNL_LINK("gre");
+MODULE_ALIAS_RTNL_LINK("gretap");
-- 
cgit v1.2.3


From fee08caf943e8ed3446ce42fa085b5e7e5f08d92 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:00 -0400
Subject: RPC/RDMA: avoid an oops due to disconnect racing with async upcalls.

RDMA disconnects yield an upcall from the RDMA connection manager,
which can race with rpc transport close, e.g. on ^C of a mount.
Ensure any rdma cm_id and qp are fully destroyed before continuing.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e3fe9054fef..d94f379f36d 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -565,6 +565,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	return 0;
 out2:
 	rdma_destroy_id(ia->ri_id);
+	ia->ri_id = NULL;
 out1:
 	return rc;
 }
@@ -585,15 +586,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
 		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
 			__func__, rc);
 	}
-	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
-		rdma_destroy_qp(ia->ri_id);
+	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
+		if (ia->ri_id->qp)
+			rdma_destroy_qp(ia->ri_id);
+		rdma_destroy_id(ia->ri_id);
+		ia->ri_id = NULL;
+	}
 	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
 		rc = ib_dealloc_pd(ia->ri_pd);
 		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
 			__func__, rc);
 	}
-	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
-		rdma_destroy_id(ia->ri_id);
 }
 
 /*
@@ -751,21 +754,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		if (rc)
 			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
 				" returned %i\n", __func__, rc);
+		rdma_destroy_qp(ia->ri_id);
+		ia->ri_id->qp = NULL;
 	}
 
-	ep->rep_func = NULL;
-
 	/* padding - could be done in rpcrdma_buffer_destroy... */
 	if (ep->rep_pad_mr) {
 		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
 		ep->rep_pad_mr = NULL;
 	}
 
-	if (ia->ri_id->qp) {
-		rdma_destroy_qp(ia->ri_id);
-		ia->ri_id->qp = NULL;
-	}
-
 	rpcrdma_clean_cq(ep->rep_cq);
 	rc = ib_destroy_cq(ep->rep_cq);
 	if (rc)
-- 
cgit v1.2.3


From 9191ca3b381b15b9a88785a8ae2fa4db8e553b0c Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:11 -0400
Subject: RPC/RDMA: adhere to protocol for unpadded client trailing write
 chunks.

The RPC/RDMA protocol allows clients and servers to avoid RDMA
operations for data which is purely the result of XDR padding.
On the client, automatically insert the necessary padding for
such server replies, and optionally don't marshal such chunks.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  | 21 +++++++++++++++++++--
 net/sunrpc/xprtrdma/transport.c |  9 +++++++++
 net/sunrpc/xprtrdma/xprt_rdma.h |  5 +++++
 3 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 721dae795d6..d245c0bf787 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	}
 
 	if (xdrbuf->tail[0].iov_len) {
+		/* the rpcrdma protocol allows us to omit any trailing
+		 * xdr pad bytes, saving the server an RDMA operation. */
+		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+			return n;
 		if (n == nsegs)
 			return 0;
 		seg[n].mr_page = NULL;
@@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
  * Scatter inline received data back into provided iov's.
  */
 static void
-rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
 	int i, npages, curlen, olen;
 	char *destp;
@@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
 	} else
 		rqst->rq_rcv_buf.tail[0].iov_len = 0;
 
+	if (pad) {
+		/* implicit padding on terminal chunk */
+		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+		while (pad--)
+			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+	}
+
 	if (copy_len)
 		dprintk("RPC:       %s: %d bytes in"
 			" %d extra segments (%d lost)\n",
@@ -794,14 +805,20 @@ repost:
 			    ((unsigned char *)iptr - (unsigned char *)headerp);
 			status = rep->rr_len + rdmalen;
 			r_xprt->rx_stats.total_rdma_reply += rdmalen;
+			/* special case - last chunk may omit padding */
+			if (rdmalen &= 3) {
+				rdmalen = 4 - rdmalen;
+				status += rdmalen;
+			}
 		} else {
 			/* else ordinary inline */
+			rdmalen = 0;
 			iptr = (__be32 *)((unsigned char *)headerp + 28);
 			rep->rr_len -= 28; /*sizeof *headerp;*/
 			status = rep->rr_len;
 		}
 		/* Fix up the rpc results for upper layer */
-		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
 		break;
 
 	case __constant_htonl(RDMA_NOMSG):
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ec6d1e7a194..c7d2380bb5e 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -71,6 +71,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+                int xprt_rdma_pad_optimize = 0;
 
 #ifdef RPC_DEBUG
 
@@ -135,6 +136,14 @@ static ctl_table xr_tunables_table[] = {
 		.extra1		= &min_memreg,
 		.extra2		= &max_memreg,
 	},
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname	= "rdma_pad_optimize",
+		.data		= &xprt_rdma_pad_optimize,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name = 0,
 	},
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2db2344d487..fde6499a53b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -280,6 +280,11 @@ struct rpcrdma_xprt {
 #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
 #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
 
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
 /*
  * Interface Adapter calls - xprtrdma/verbs.c
  */
-- 
cgit v1.2.3


From 926449ba66ce2a45c619bbe755b00d6bdbf0d83e Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:21 -0400
Subject: RPC/RDMA: return a consistent error, when connect fails.

The xprt_connect call path does not expect such errors as ECONNREFUSED
to be returned from failed transport connection attempts, otherwise it
translates them to EIO and signals fatal errors. For example, mount.nfs
prints simply "internal error". Translate all such errors to ENOTCONN
from RPC/RDMA to match sockets behavior.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index d245c0bf787..94ecf1b65ff 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -699,7 +699,7 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
 			xprt_wake_pending_tasks(xprt, 0);
 	} else {
 		if (xprt_test_and_clear_connected(xprt))
-			xprt_wake_pending_tasks(xprt, ep->rep_connected);
+			xprt_wake_pending_tasks(xprt, -ENOTCONN);
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
-- 
cgit v1.2.3


From 1a954051b0cf79bd67e5f9db40333e3a9b1d05d2 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:31 -0400
Subject: RPC/RDMA: fix connect/reconnect resource leak.

The RPC/RDMA code can leak RDMA connection manager endpoints in
certain error cases on connect. Don't signal unwanted events,
and be certain to destroy any allocated qp.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index d94f379f36d..a63d0c0ec01 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -338,10 +338,8 @@ connected:
 		wake_up_all(&ep->rep_connect_wait);
 		break;
 	default:
-		ia->ri_async_rc = -EINVAL;
-		dprintk("RPC:       %s: unexpected CM event %X\n",
+		dprintk("RPC:       %s: unexpected CM event %d\n",
 			__func__, event->event);
-		complete(&ia->ri_done);
 		break;
 	}
 
@@ -355,6 +353,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	struct rdma_cm_id *id;
 	int rc;
 
+	init_completion(&ia->ri_done);
+
 	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
 	if (IS_ERR(id)) {
 		rc = PTR_ERR(id);
@@ -427,8 +427,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	struct ib_device_attr devattr;
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 
-	init_completion(&ia->ri_done);
-
 	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
 	if (IS_ERR(ia->ri_id)) {
 		rc = PTR_ERR(ia->ri_id);
@@ -815,6 +813,7 @@ retry:
 			goto out;
 		}
 		/* END TEMP */
+		rdma_destroy_qp(ia->ri_id);
 		rdma_destroy_id(ia->ri_id);
 		ia->ri_id = id;
 	}
-- 
cgit v1.2.3


From 5675add36e76b9487e7f9e689f854cb8d6afd9b4 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:41 -0400
Subject: RPC/RDMA: harden connection logic against missing/late rdma_cm
 upcalls.

Add defensive timeouts to wait_for_completion() calls in RDMA
address resolution, and make them interruptible. Fix the timeout
units to milliseconds (formerly jiffies) and move to private header.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c     | 11 +++++++----
 net/sunrpc/xprtrdma/xprt_rdma.h |  3 +++
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a63d0c0ec01..f46fb93f421 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	switch (event->event) {
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		ia->ri_async_rc = 0;
 		complete(&ia->ri_done);
 		break;
 	case RDMA_CM_EVENT_ADDR_ERROR:
@@ -363,26 +364,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 		return id;
 	}
 
-	ia->ri_async_rc = 0;
+	ia->ri_async_rc = -ETIMEDOUT;
 	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
 	if (rc) {
 		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
 			__func__, rc);
 		goto out;
 	}
-	wait_for_completion(&ia->ri_done);
+	wait_for_completion_interruptible_timeout(&ia->ri_done,
+				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 	rc = ia->ri_async_rc;
 	if (rc)
 		goto out;
 
-	ia->ri_async_rc = 0;
+	ia->ri_async_rc = -ETIMEDOUT;
 	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
 	if (rc) {
 		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
 			__func__, rc);
 		goto out;
 	}
-	wait_for_completion(&ia->ri_done);
+	wait_for_completion_interruptible_timeout(&ia->ri_done,
+				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 	rc = ia->ri_async_rc;
 	if (rc)
 		goto out;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index fde6499a53b..c7a7eba991b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -51,6 +51,9 @@
 #include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
 #include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
 
+#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
 /*
  * Interface Adapter -- one per transport instance
  */
-- 
cgit v1.2.3


From 5f37d561e0f0cd98017c389cbc22080290f11c3c Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:52 -0400
Subject: RPC/RDMA: reformat a debug printk to keep lines together.

The send marshaling code split a particular dprintk across two
lines, which makes it hard to extract from logfiles.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 94ecf1b65ff..15101f294e0 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -512,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	if (hdrlen == 0)
 		return -1;
 
-	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
-		"                   headerp 0x%p base 0x%p lkey 0x%x\n",
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+		" headerp 0x%p base 0x%p lkey 0x%x\n",
 		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
 		headerp, base, req->rl_iov.lkey);
 
-- 
cgit v1.2.3


From b3cd8d45a764e6edb06e7bd386faf99a879569b8 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:02:02 -0400
Subject: RPC/RDMA: optionally emit useful transport info upon
 connect/disconnect.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/transport.c |  2 +-
 net/sunrpc/xprtrdma/verbs.c     | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c7d2380bb5e..c2da680273c 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -784,7 +784,7 @@ static void __exit xprt_rdma_cleanup(void)
 {
 	int rc;
 
-	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+	dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
 #ifdef RPC_DEBUG
 	if (sunrpc_table_header) {
 		unregister_sysctl_table(sunrpc_table_header);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f46fb93f421..170e69cba6c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -344,6 +344,27 @@ connected:
 		break;
 	}
 
+#ifdef RPC_DEBUG
+	if (connstate == 1) {
+		int ird = attr.max_dest_rd_atomic;
+		int tird = ep->rep_remote_cma.responder_resources;
+		printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
+			"on %s, memreg %d slots %d ird %d%s\n",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port),
+			ia->ri_id->device->name,
+			ia->ri_memreg_strategy,
+			xprt->rx_buf.rb_max_requests,
+			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+	} else if (connstate < 0) {
+		printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
+			"closed (%d)\n",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port),
+			connstate);
+	}
+#endif
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 08ca0dce1eafa419059ac4cad9ed522af7052526 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Fri, 10 Oct 2008 11:32:34 -0400
Subject: RPC/RDMA: correct the reconnect timer backoff

The RPC/RDMA code had a constant 5-second reconnect backoff, and
always performed it, even when re-establishing a connection to a
server after the RPC layer closed it due to being idle. Make it
an geometric backoff (up to 30 seconds), and don't delay idle
reconnect.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/transport.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c2da680273c..9839c3d9414 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -463,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
 	dprintk("RPC:       %s: closing\n", __func__);
+	if (r_xprt->rx_ep.rep_connected > 0)
+		xprt->reestablish_timeout = 0;
 	xprt_disconnect_done(xprt);
 	(void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 }
@@ -490,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task)
 			/* Reconnect */
 			schedule_delayed_work(&r_xprt->rdma_connect,
 				xprt->reestablish_timeout);
+			xprt->reestablish_timeout <<= 1;
+			if (xprt->reestablish_timeout > (30 * HZ))
+				xprt->reestablish_timeout = (30 * HZ);
+			else if (xprt->reestablish_timeout < (5 * HZ))
+				xprt->reestablish_timeout = (5 * HZ);
 		} else {
 			schedule_delayed_work(&r_xprt->rdma_connect, 0);
 			if (!RPC_IS_ASYNC(task))
-- 
cgit v1.2.3


From c055551e97e1ca00781bc41523f829e05a8afed7 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Fri, 10 Oct 2008 11:32:45 -0400
Subject: RPC/RDMA: ensure connection attempt is complete before signalling.

The RPC/RDMA connection logic could return early from reconnection
attempts, leading to additional spurious retries.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 170e69cba6c..a5fef5e6c32 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -804,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	struct rdma_cm_id *id;
 	int rc = 0;
 	int retry_count = 0;
-	int reconnect = (ep->rep_connected != 0);
 
-	if (reconnect) {
+	if (ep->rep_connected != 0) {
 		struct rpcrdma_xprt *xprt;
 retry:
 		rc = rpcrdma_ep_disconnect(ep, ia);
@@ -871,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
 		goto out;
 	}
 
-	if (reconnect)
-		return 0;
-
 	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
 
 	/*
-- 
cgit v1.2.3


From 1839faab9a2747bcd30ee14e50575a39bf6735d4 Mon Sep 17 00:00:00 2001
From: Tobias Brunner <tobias.brunner@strongswan.org>
Date: Fri, 10 Oct 2008 14:07:03 -0700
Subject: af_key: fix SADB_X_SPDDELETE response

When deleting an SPD entry using SADB_X_SPDDELETE, c.data.byid is not
initialized to zero in pfkey_spddelete(). Thus, key_notify_policy()
responds with a PF_KEY message of type SADB_X_SPDDELETE2 instead of
SADB_X_SPDDELETE.

Signed-off-by: Tobias Brunner <tobias.brunner@strongswan.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 362fe317e1f..e55e0441e4d 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2341,6 +2341,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
+	c.data.byid = 0;
 	c.event = XFRM_MSG_DELPOLICY;
 	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
 
-- 
cgit v1.2.3


From f901b64472fdabc72eca2b9426fa4e96972b64c4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Oct 2008 12:18:04 -0700
Subject: ipvs: Add proper dependencies on IP_VS, and fix description header
 line.

Linus noted a build failure case:

net/netfilter/ipvs/ip_vs_xmit.c: In function 'ip_vs_tunnel_xmit':
net/netfilter/ipvs/ip_vs_xmit.c:616: error: implicit declaration of function 'ip_select_ident'

The proper include file (net/ip.h) is being included in ip_vs_xmit.c to get
that declaration.  So the only possible case where this can happen is if
CONFIG_INET is not enabled.

This seems to be purely a missing dependency in the ipvs/Kconfig file IP_VS
entry.

Also, while we're here, remove the out of date "EXPERIMENTAL" string in the
IP_VS config help header line.  IP_VS no longer depends upon CONFIG_EXPERIMENTAL

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipvs/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index de6004de80b..05048e40326 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -2,8 +2,8 @@
 # IP Virtual Server configuration
 #
 menuconfig IP_VS
-	tristate "IP virtual server support (EXPERIMENTAL)"
-	depends on NETFILTER
+	tristate "IP virtual server support"
+	depends on NET && INET && NETFILTER
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
-- 
cgit v1.2.3


From 7bb82d924536cfa62db73dd381b07d9e9b084ffa Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 11 Oct 2008 12:20:15 -0700
Subject: gre: Initialise rtnl_link tunnel parameters properly

Brown paper bag error of calling memset with sizeof(p) instead
of sizeof(*p).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 05ebce2881e..85c487b8572 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1345,7 +1345,7 @@ out:
 static void ipgre_netlink_parms(struct nlattr *data[],
 				struct ip_tunnel_parm *parms)
 {
-	memset(parms, 0, sizeof(parms));
+	memset(parms, 0, sizeof(*parms));
 
 	parms->iph.protocol = IPPROTO_GRE;
 
-- 
cgit v1.2.3


From ff71268aa4e9d961643c5e0ea5e14a3dd6d27f28 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sun, 12 Oct 2008 21:03:38 -0700
Subject: wireless: remove duplicated #include

Removed duplicated include <linux/list.h> in
net/wireless/core.c.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/wireless/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5cadbeb76a1..24fdd4cd22c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -13,7 +13,6 @@
 #include <linux/debugfs.h>
 #include <linux/notifier.h>
 #include <linux/device.h>
-#include <linux/list.h>
 #include <net/genetlink.h>
 #include <net/cfg80211.h>
 #include <net/wireless.h>
-- 
cgit v1.2.3


From ab396eb03f33a2e2afb7b0603a43929bf5857c45 Mon Sep 17 00:00:00 2001
From: Dimitris Michailidis <dm@chelsio.com>
Date: Sun, 12 Oct 2008 21:07:34 -0700
Subject: net: Fix off-by-one in skb_dma_map

The unwind loop iterates down to -1 instead of stopping at 0 and ends up
accessing ->frags[-1].

Signed-off-by: Dimitris Michailidis <dm@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skb_dma_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c
index 1f49afcd8e8..86234923a3b 100644
--- a/net/core/skb_dma_map.c
+++ b/net/core/skb_dma_map.c
@@ -35,7 +35,7 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb,
 	return 0;
 
 unwind:
-	while (i-- >= 0) {
+	while (--i >= 0) {
 		skb_frag_t *fp = &sp->frags[i];
 
 		dma_unmap_page(dev, sp->dma_maps[i + 1],
-- 
cgit v1.2.3


From 14717f811b6662ca77bf39c07f5589c3b084f942 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sun, 12 Oct 2008 21:08:34 -0700
Subject: netfilter: remove unused #include <version.h>

The file(s) below do not use LINUX_VERSION_CODE nor KERNEL_VERSION.
  net/netfilter/nf_tproxy_core.c

This patch removes the said #include <version.h>.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_tproxy_core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index fe34f4bf74c..cdc97f3105a 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -10,7 +10,6 @@
  *
  */
 
-#include <linux/version.h>
 #include <linux/module.h>
 
 #include <linux/net.h>
-- 
cgit v1.2.3


From bf94e17bc8d35fc339945a42990a2f2b5e9b5a40 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 12 Oct 2008 23:51:38 -0700
Subject: net/mac80211/rx.c: fix build error

older versions of gcc do not recognize that ieee80211_rx_h_mesh_fwding()
is unused when CONFIG_MAC80211_MESH is disabled:

  net/built-in.o: In function `ieee80211_rx_h_mesh_fwding':
  rx.c:(.text+0xd89af): undefined reference to `mpp_path_lookup'
  rx.c:(.text+0xd89c6): undefined reference to `mpp_path_add'

as this code construct:

        if (ieee80211_vif_is_mesh(&sdata->vif))
                CALL_RXH(ieee80211_rx_h_mesh_fwding);

still causes ieee80211_rx_h_mesh_fwding() to be linked in.

Protect these places with an #ifdef.

commit b0dee578 ("Fix modpost failure when rx handlers are not inlined.")
solved part of this problem - this patch is still needed.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mac80211/rx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 77e7b014872..cf6b121e1bb 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1379,6 +1379,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 	return RX_QUEUED;
 }
 
+#ifdef CONFIG_MAC80211_MESH
 static ieee80211_rx_result
 ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 {
@@ -1453,7 +1454,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 	else
 		return RX_DROP_MONITOR;
 }
-
+#endif
 
 static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
@@ -1780,8 +1781,10 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
 	/* must be after MMIC verify so header is counted in MPDU mic */
 	CALL_RXH(ieee80211_rx_h_remove_qos_control)
 	CALL_RXH(ieee80211_rx_h_amsdu)
+#ifdef CONFIG_MAC80211_MESH
 	if (ieee80211_vif_is_mesh(&sdata->vif))
 		CALL_RXH(ieee80211_rx_h_mesh_fwding);
+#endif
 	CALL_RXH(ieee80211_rx_h_data)
 	CALL_RXH(ieee80211_rx_h_ctrl)
 	CALL_RXH(ieee80211_rx_h_action)
-- 
cgit v1.2.3


From a447c0932445f92ce6f4c1bd020f62c5097a7842 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 13 Oct 2008 10:46:57 +0100
Subject: vfs: Use const for kernel parser table

This is a much better version of a previous patch to make the parser
tables constant. Rather than changing the typedef, we put the "const" in
all the various places where its required, allowing the __initconst
exception for nfsroot which was the cause of the previous trouble.

This was posted for review some time ago and I believe its been in -mm
since then.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Alexander Viro <aviro@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/9p/client.c   | 2 +-
 net/9p/trans_fd.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/9p/client.c b/net/9p/client.c
index 10e320307ec..e053e06028a 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -52,7 +52,7 @@ enum {
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_msize, "msize=%u"},
 	{Opt_legacy, "noextend"},
 	{Opt_trans, "trans=%s"},
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index d652baf5ff9..6dabbdb6665 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -86,7 +86,7 @@ enum {
 	Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_port, "port=%u"},
 	{Opt_rfdno, "rfdno=%u"},
 	{Opt_wfdno, "wfdno=%u"},
-- 
cgit v1.2.3


From b4bb4ac8cb05ab5c13dfb7b47ef243982d3ad526 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Mon, 13 Oct 2008 18:43:59 -0700
Subject: pktgen: fix skb leak in case of failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Seems that skb goes into void unless something magic happened
in pskb_expand_head in case of failure.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index a756847e381..99f656d35b4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2474,7 +2474,7 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev,
 				if (ret < 0) {
 					printk(KERN_ERR "Error expanding "
 					       "ipsec packet %d\n",ret);
-					return 0;
+					goto err;
 				}
 			}
 
@@ -2484,8 +2484,7 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev,
 			if (ret) {
 				printk(KERN_ERR "Error creating ipsec "
 				       "packet %d\n",ret);
-				kfree_skb(skb);
-				return 0;
+				goto err;
 			}
 			/* restore ll */
 			eth = (__u8 *) skb_push(skb, ETH_HLEN);
@@ -2494,6 +2493,9 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev,
 		}
 	}
 	return 1;
+err:
+	kfree_skb(skb);
+	return 0;
 }
 #endif
 
-- 
cgit v1.2.3


From e7dc849494608fca7a7493c07eb190219c00d064 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 13 Oct 2008 18:54:07 -0700
Subject: netns: mib6 section fixlet

  LD      net/ipv6/ipv6.o
WARNING: net/ipv6/ipv6.o(.text+0xd8): Section mismatch in reference from the function inet6_net_init() to the function .init.text:ipv6_init_mibs()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 050e14b7f70..01edac88851 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -834,7 +834,7 @@ static void __net_exit ipv6_cleanup_mibs(struct net *net)
 	snmp_mib_free((void **)net->mib.icmpv6msg_statistics);
 }
 
-static int inet6_net_init(struct net *net)
+static int __net_init inet6_net_init(struct net *net)
 {
 	int err = 0;
 
-- 
cgit v1.2.3


From 510149e31974fdbb2c00c9bee6c0e2a688e61c85 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Oct 2008 18:58:48 -0700
Subject: dsa: fix compile bug on s390

git commit 45cec1bac0719c904bb5f4405c2937f7e715888c
"dsa: Need to select PHYLIB." causes this build bug on s390:

drivers/built-in.o: In function `phy_stop_interrupts':
/home/heicarst/linux-2.6/drivers/net/phy/phy.c:631: undefined reference to `free_irq'
/home/heicarst/linux-2.6/drivers/net/phy/phy.c:646: undefined reference to `enable_irq'
drivers/built-in.o: In function `phy_start_interrupts':
/home/heicarst/linux-2.6/drivers/net/phy/phy.c:601: undefined reference to `request_irq'
drivers/built-in.o: In function `phy_interrupt':
/home/heicarst/linux-2.6/drivers/net/phy/phy.c:528: undefined reference to `disable_irq_nosync'
drivers/built-in.o: In function `phy_change':
/home/heicarst/linux-2.6/drivers/net/phy/phy.c:674: undefined reference to `enable_irq'
/home/heicarst/linux-2.6/drivers/net/phy/phy.c:692: undefined reference to `disable_irq'

PHYLIB has alread a depend on !S390, however select PHYLIB at DSA overrides
that unfortunately. So add a depend on !S390 to DSA as well.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index cdce4c6c672..49211b35725 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,7 +1,7 @@
 menuconfig NET_DSA
 	bool "Distributed Switch Architecture support"
 	default n
-	depends on EXPERIMENTAL
+	depends on EXPERIMENTAL && !S390
 	select PHYLIB
 	---help---
 	  This allows you to use hardware switch chips that use
-- 
cgit v1.2.3


From 113aa838ec3a235d883f8357d31d90e16c47fc89 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Mon, 13 Oct 2008 19:01:08 -0700
Subject: net: Rationalise email address: Network Specific Parts

Clean up the various different email addresses of mine listed in the code
to a single current and valid address. As Dave says his network merges
for 2.6.28 are now done this seems a good point to send them in where
they won't risk disrupting real changes.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/802/psnap.c          | 2 +-
 net/appletalk/ddp.c      | 4 ++--
 net/core/datagram.c      | 2 +-
 net/core/dev_mcast.c     | 2 +-
 net/core/skbuff.c        | 2 +-
 net/core/stream.c        | 2 +-
 net/ipv4/icmp.c          | 2 +-
 net/ipv4/igmp.c          | 2 +-
 net/ipv4/ip_fragment.c   | 2 +-
 net/ipv4/ip_input.c      | 2 +-
 net/ipv4/ipip.c          | 2 +-
 net/ipv4/ipmr.c          | 2 +-
 net/ipv4/udp.c           | 2 +-
 net/netlink/af_netlink.c | 2 +-
 net/sunrpc/xprtsock.c    | 4 ++--
 net/unix/af_unix.c       | 2 +-
 16 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/802/psnap.c b/net/802/psnap.c
index b3cfe5a14fc..70980baeb68 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -1,7 +1,7 @@
 /*
  *	SNAP data link layer. Derived from 802.2
  *
- *		Alan Cox <Alan.Cox@linux.org>,
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>,
  *		from the 802.2 layer by Greg Page.
  *		Merged in additions from Greg Page's psnap.c.
  *
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 0c850427a85..d3134e7e6ee 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -2,7 +2,7 @@
  *	DDP:	An implementation of the AppleTalk DDP protocol for
  *		Ethernet 'ELAP'.
  *
- *		Alan Cox  <Alan.Cox@linux.org>
+ *		Alan Cox  <alan@lxorguk.ukuu.org.uk>
  *
  *		With more than a little assistance from
  *
@@ -1934,6 +1934,6 @@ static void __exit atalk_exit(void)
 module_exit(atalk_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Alan Cox <Alan.Cox@linux.org>");
+MODULE_AUTHOR("Alan Cox <alan@lxorguk.ukuu.org.uk>");
 MODULE_DESCRIPTION("AppleTalk 0.20\n");
 MODULE_ALIAS_NETPROTO(PF_APPLETALK);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 52f577a0f54..ee631843c2f 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -9,7 +9,7 @@
  *	identical recvmsg() code. So we share it here. The poll was
  *	shared before but buried in udp.c so I moved it.
  *
- *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old
+ *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
  *						     udp.c code)
  *
  *	Fixes:
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 5402b3b38e0..9e2fa39f22a 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -6,7 +6,7 @@
  *		Richard Underwood <richard@wuzz.demon.co.uk>
  *
  *	Stir fried together from the IP multicast and CAP patches above
- *		Alan Cox <Alan.Cox@linux.org>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
  *
  *	Fixes:
  *		Alan Cox	:	Update the device on a real delete
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7f7bb1a636d..4e22e3a3535 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1,7 +1,7 @@
 /*
  *	Routines having to do with the 'struct sk_buff' memory handlers.
  *
- *	Authors:	Alan Cox <iiitac@pyr.swan.ac.uk>
+ *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
  *			Florian La Roche <rzsfl@rz.uni-sb.de>
  *
  *	Fixes:
diff --git a/net/core/stream.c b/net/core/stream.c
index a6b3437ff08..8727cead64a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -9,7 +9,7 @@
  *
  *     Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
  *                     (from old tcp.c code)
- *                     Alan Cox <alan@redhat.com> (Borrowed comments 8-))
+ *                     Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
  */
 
 #include <linux/module.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 55c355e6323..72b2de76f1c 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1,7 +1,7 @@
 /*
  *	NET3:	Implementation of the ICMP protocol layer.
  *
- *		Alan Cox, <alan@redhat.com>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
  *
  *	This program is free software; you can redistribute it and/or
  *	modify it under the terms of the GNU General Public License
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7f9e337e390..a0d86455c53 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -9,7 +9,7 @@
  *	seems to fall out with gcc 2.6.2.
  *
  *	Authors:
- *		Alan Cox <Alan.Cox@linux.org>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
  *
  *	This program is free software; you can redistribute it and/or
  *	modify it under the terms of the GNU General Public License
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 2152d222b95..e4f81f54bef 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -6,7 +6,7 @@
  *		The IP fragmentation functionality.
  *
  * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
- *		Alan Cox <Alan.Cox@linux.org>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
  *
  * Fixes:
  *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e0bed56c51f..861978a4f1a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -8,7 +8,7 @@
  * Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *		Donald Becker, <becker@super.org>
- *		Alan Cox, <Alan.Cox@linux.org>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
  *		Richard Underwood
  *		Stefan Becker, <stefanb@yello.ping.de>
  *		Jorge Cwik, <jorge@laser.satlink.net>
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4c6d2caf920..29609d29df7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -41,7 +41,7 @@
 		Made the tunnels use dev->name not tunnel: when error reporting.
 		Added tx_dropped stat
 
-		-Alan Cox	(Alan.Cox@linux.org) 21 March 95
+		-Alan Cox	(alan@lxorguk.ukuu.org.uk) 21 March 95
 
 	Reworked:
 		Changed to tunnel to destination gateway in addition to the
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c519b8d30ee..b42e082cc17 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1,7 +1,7 @@
 /*
  *	IP multicast routing support for mrouted 3.6/3.8
  *
- *		(c) 1995 Alan Cox, <alan@redhat.com>
+ *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
  *	  Linux Consultancy and Custom Driver Development
  *
  *	This program is free software; you can redistribute it and/or
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eacf4cfef14..2095abc3cab 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -8,7 +8,7 @@
  * Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- *		Alan Cox, <Alan.Cox@linux.org>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
  *		Hirokazu Takahashi, <taka@valinux.co.jp>
  *
  * Fixes:
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b0eacc0007c..2fd8afac5f7 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1,7 +1,7 @@
 /*
  * NETLINK      Kernel-user communication protocol.
  *
- * 		Authors:	Alan Cox <alan@redhat.com>
+ * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
  * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  *
  *		This program is free software; you can redistribute it and/or
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4486c59c3ac..9a288d5eea6 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3,8 +3,8 @@
  *
  * Client-side transport implementation for sockets.
  *
- * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
- * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
+ * TCP callback races fixes (C) 1998 Red Hat
+ * TCP send fixes (C) 1998 Red Hat
  * TCP NFS related read + write fixes
  *  (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
  *
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 015606b54d9..c647aab8d41 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1,7 +1,7 @@
 /*
  * NET4:	Implementation of BSD Unix domain sockets.
  *
- * Authors:	Alan Cox, <alan.cox@linux.org>
+ * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
-- 
cgit v1.2.3


From 56f26f7b78af36d0f048a9403084870d2ffb549f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 13 Oct 2008 21:59:03 +0200
Subject: net/rfkill/rfkill-input.c needs <linux/sched.h>

For some m68k configs, I get:

| net/rfkill/rfkill-input.c: In function 'rfkill_start':
| net/rfkill/rfkill-input.c:208: error: dereferencing pointer to incomplete type

As the incomplete type is `struct task_struct', including <linux/sched.h> fixes
it.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/rfkill/rfkill-input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rfkill/rfkill-input.c b/net/rfkill/rfkill-input.c
index e5b69556bb5..21124ec0a73 100644
--- a/net/rfkill/rfkill-input.c
+++ b/net/rfkill/rfkill-input.c
@@ -16,6 +16,7 @@
 #include <linux/workqueue.h>
 #include <linux/init.h>
 #include <linux/rfkill.h>
+#include <linux/sched.h>
 
 #include "rfkill-input.h"
 
-- 
cgit v1.2.3


From 38f7ac3eb7206ffd1201c14baba832d7e363de0a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 14 Oct 2008 11:56:59 -0700
Subject: netfilter: restore lost #ifdef guarding defrag exception

Nir Tzachar <nir.tzachar@gmail.com> reported a warning when sending
fragments over loopback with NAT:

[ 6658.338121] WARNING: at net/ipv4/netfilter/nf_nat_standalone.c:89 nf_nat_fn+0x33/0x155()

The reason is that defragmentation is skipped for already tracked connections.
This is wrong in combination with NAT and ip_conntrack actually had some ifdefs
to avoid this behaviour when NAT is compiled in.

The entire "optimization" may seem a bit silly, for now simply restoring the
lost #ifdef is the easiest solution until we can come up with something better.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_defrag_ipv4.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index aa2c50a180f..fa2d6b6fc3e 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -41,12 +41,13 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
 					  int (*okfn)(struct sk_buff *))
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
 	/* Previously seen (loopback)?  Ignore.  Do this before
 	   fragment check. */
 	if (skb->nfct)
 		return NF_ACCEPT;
 #endif
-
+#endif
 	/* Gather fragments. */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 		if (nf_ct_ipv4_gather_frags(skb,
-- 
cgit v1.2.3


From 129404a1f117c35c6224e020444fc27eb4479817 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 14 Oct 2008 11:57:33 -0700
Subject: netfilter: fix ebtables dependencies

Ingo Molnar reported a build error with ebtables:

ERROR: "ebt_register_table" [net/bridge/netfilter/ebtable_filter.ko] undefined!
ERROR: "ebt_do_table" [net/bridge/netfilter/ebtable_filter.ko] undefined!
ERROR: "ebt_unregister_table" [net/bridge/netfilter/ebtable_filter.ko] undefined!
ERROR: "ebt_register_table" [net/bridge/netfilter/ebtable_broute.ko] undefined!
ERROR: "ebt_do_table" [net/bridge/netfilter/ebtable_broute.ko] undefined!
ERROR: "ebt_unregister_table" [net/bridge/netfilter/ebtable_broute.ko] undefined!
make[1]: *** [__modpost] Error 1
make: *** [modules] Error 2

This reason is a missing dependencies that got lost during Kconfig cleanups.
Restore it.

Tested-by: Ingo Molnar <mingo@elte.hu>

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 366d3e9d51f..ba6f73eb06c 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -4,6 +4,7 @@
 
 menuconfig BRIDGE_NF_EBTABLES
 	tristate "Ethernet Bridge tables (ebtables) support"
+	depends on BRIDGE && BRIDGE_NETFILTER
 	select NETFILTER_XTABLES
 	help
 	  ebtables is a general, extensible frame/packet identification
-- 
cgit v1.2.3


From e6a7d3c04f8fe49099521e6dc9a46b0272381f2f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 14 Oct 2008 11:58:31 -0700
Subject: netfilter: ctnetlink: remove bogus module dependency between
 ctnetlink and nf_nat

This patch removes the module dependency between ctnetlink and
nf_nat by means of an indirect call that is initialized when
nf_nat is loaded. Now, nf_conntrack_netlink only requires
nf_conntrack and nfnetlink.

This patch puts nfnetlink_parse_nat_setup_hook into the
nf_conntrack_core to avoid dependencies between ctnetlink,
nf_conntrack_ipv4 and nf_conntrack_ipv6.

This patch also introduces the function ctnetlink_change_nat
that is only invoked from the creation path. Actually, the
nat handling cannot be invoked from the update path since
this is not allowed. By introducing this function, we remove
the useless nat handling in the update path and we avoid
deadlock-prone code.

This patch also adds the required EAGAIN logic for nfnetlink.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_nat_core.c     |  97 ++++++++++++++++++++++
 net/netfilter/nf_conntrack_core.c    |   7 ++
 net/netfilter/nf_conntrack_netlink.c | 151 ++++++++++++++---------------------
 net/netfilter/nfnetlink.c            |  12 ++-
 4 files changed, 174 insertions(+), 93 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 2ac9eaf1a8c..a65cf692359 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -584,6 +584,98 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
 	.flags		= NF_CT_EXT_F_PREALLOC,
 };
 
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
+	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
+	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
+};
+
+static int nfnetlink_parse_nat_proto(struct nlattr *attr,
+				     const struct nf_conn *ct,
+				     struct nf_nat_range *range)
+{
+	struct nlattr *tb[CTA_PROTONAT_MAX+1];
+	const struct nf_nat_protocol *npt;
+	int err;
+
+	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
+	if (err < 0)
+		return err;
+
+	npt = nf_nat_proto_find_get(nf_ct_protonum(ct));
+	if (npt->nlattr_to_range)
+		err = npt->nlattr_to_range(tb, range);
+	nf_nat_proto_put(npt);
+	return err;
+}
+
+static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
+	[CTA_NAT_MINIP]		= { .type = NLA_U32 },
+	[CTA_NAT_MAXIP]		= { .type = NLA_U32 },
+};
+
+static int
+nfnetlink_parse_nat(struct nlattr *nat,
+		    const struct nf_conn *ct, struct nf_nat_range *range)
+{
+	struct nlattr *tb[CTA_NAT_MAX+1];
+	int err;
+
+	memset(range, 0, sizeof(*range));
+
+	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[CTA_NAT_MINIP])
+		range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
+
+	if (!tb[CTA_NAT_MAXIP])
+		range->max_ip = range->min_ip;
+	else
+		range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
+
+	if (range->min_ip)
+		range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+	if (!tb[CTA_NAT_PROTO])
+		return 0;
+
+	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  struct nlattr *attr)
+{
+	struct nf_nat_range range;
+
+	if (nfnetlink_parse_nat(attr, ct, &range) < 0)
+		return -EINVAL;
+	if (nf_nat_initialized(ct, manip))
+		return -EEXIST;
+
+	return nf_nat_setup_info(ct, &range, manip);
+}
+#else
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  struct nlattr *attr)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 static int __net_init nf_nat_net_init(struct net *net)
 {
 	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
@@ -654,6 +746,9 @@ static int __init nf_nat_init(void)
 
 	BUG_ON(nf_nat_seq_adjust_hook != NULL);
 	rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
+	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
+	rcu_assign_pointer(nfnetlink_parse_nat_setup_hook,
+			   nfnetlink_parse_nat_setup);
 	return 0;
 
  cleanup_extend:
@@ -667,10 +762,12 @@ static void __exit nf_nat_cleanup(void)
 	nf_ct_l3proto_put(l3proto);
 	nf_ct_extend_unregister(&nat_extend);
 	rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
+	rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL);
 	synchronize_net();
 }
 
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-ipv4");
 
 module_init(nf_nat_init);
 module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 27de3c7b006..622d7c671cb 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -38,9 +38,16 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_nat.h>
 
 #define NF_CONNTRACK_VERSION	"0.5.0"
 
+unsigned int
+(*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
+				  enum nf_nat_manip_type manip,
+				  struct nlattr *attr) __read_mostly;
+EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
+
 DEFINE_SPINLOCK(nf_conntrack_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index cadfd15b44f..08e82d64eb6 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -689,71 +689,6 @@ ctnetlink_parse_tuple(struct nlattr *cda[], struct nf_conntrack_tuple *tuple,
 	return 0;
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
-static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
-	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
-	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
-};
-
-static int nfnetlink_parse_nat_proto(struct nlattr *attr,
-				     const struct nf_conn *ct,
-				     struct nf_nat_range *range)
-{
-	struct nlattr *tb[CTA_PROTONAT_MAX+1];
-	const struct nf_nat_protocol *npt;
-	int err;
-
-	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
-	if (err < 0)
-		return err;
-
-	npt = nf_nat_proto_find_get(nf_ct_protonum(ct));
-	if (npt->nlattr_to_range)
-		err = npt->nlattr_to_range(tb, range);
-	nf_nat_proto_put(npt);
-	return err;
-}
-
-static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
-	[CTA_NAT_MINIP]		= { .type = NLA_U32 },
-	[CTA_NAT_MAXIP]		= { .type = NLA_U32 },
-};
-
-static inline int
-nfnetlink_parse_nat(struct nlattr *nat,
-		    const struct nf_conn *ct, struct nf_nat_range *range)
-{
-	struct nlattr *tb[CTA_NAT_MAX+1];
-	int err;
-
-	memset(range, 0, sizeof(*range));
-
-	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
-	if (err < 0)
-		return err;
-
-	if (tb[CTA_NAT_MINIP])
-		range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
-
-	if (!tb[CTA_NAT_MAXIP])
-		range->max_ip = range->min_ip;
-	else
-		range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
-
-	if (range->min_ip)
-		range->flags |= IP_NAT_RANGE_MAP_IPS;
-
-	if (!tb[CTA_NAT_PROTO])
-		return 0;
-
-	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
-	if (err < 0)
-		return err;
-
-	return 0;
-}
-#endif
-
 static inline int
 ctnetlink_parse_help(struct nlattr *attr, char **helper_name)
 {
@@ -878,6 +813,34 @@ out:
 	return err;
 }
 
+static int
+ctnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  struct nlattr *attr)
+{
+	typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+
+	parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
+	if (!parse_nat_setup) {
+#ifdef CONFIG_KMOD
+		rcu_read_unlock();
+		nfnl_unlock();
+		if (request_module("nf-nat-ipv4") < 0) {
+			nfnl_lock();
+			rcu_read_lock();
+			return -EOPNOTSUPP;
+		}
+		nfnl_lock();
+		rcu_read_lock();
+		if (nfnetlink_parse_nat_setup_hook)
+			return -EAGAIN;
+#endif
+		return -EOPNOTSUPP;
+	}
+
+	return parse_nat_setup(ct, manip, attr);
+}
+
 static int
 ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[])
 {
@@ -897,31 +860,6 @@ ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[])
 		/* ASSURED bit can only be set */
 		return -EBUSY;
 
-	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
-#ifndef CONFIG_NF_NAT_NEEDED
-		return -EOPNOTSUPP;
-#else
-		struct nf_nat_range range;
-
-		if (cda[CTA_NAT_DST]) {
-			if (nfnetlink_parse_nat(cda[CTA_NAT_DST], ct,
-						&range) < 0)
-				return -EINVAL;
-			if (nf_nat_initialized(ct, IP_NAT_MANIP_DST))
-				return -EEXIST;
-			nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
-		}
-		if (cda[CTA_NAT_SRC]) {
-			if (nfnetlink_parse_nat(cda[CTA_NAT_SRC], ct,
-						&range) < 0)
-				return -EINVAL;
-			if (nf_nat_initialized(ct, IP_NAT_MANIP_SRC))
-				return -EEXIST;
-			nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
-		}
-#endif
-	}
-
 	/* Be careful here, modifying NAT bits can screw up things,
 	 * so don't let users modify them directly if they don't pass
 	 * nf_nat_range. */
@@ -929,6 +867,31 @@ ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[])
 	return 0;
 }
 
+static int
+ctnetlink_change_nat(struct nf_conn *ct, struct nlattr *cda[])
+{
+#ifdef CONFIG_NF_NAT_NEEDED
+	int ret;
+
+	if (cda[CTA_NAT_DST]) {
+		ret = ctnetlink_parse_nat_setup(ct,
+						IP_NAT_MANIP_DST,
+						cda[CTA_NAT_DST]);
+		if (ret < 0)
+			return ret;
+	}
+	if (cda[CTA_NAT_SRC]) {
+		ret = ctnetlink_parse_nat_setup(ct,
+						IP_NAT_MANIP_SRC,
+						cda[CTA_NAT_SRC]);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
 
 static inline int
 ctnetlink_change_helper(struct nf_conn *ct, struct nlattr *cda[])
@@ -1157,6 +1120,14 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
 		}
 	}
 
+	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
+		err = ctnetlink_change_nat(ct, cda);
+		if (err < 0) {
+			rcu_read_unlock();
+			goto err;
+		}
+	}
+
 	if (cda[CTA_PROTOINFO]) {
 		err = ctnetlink_change_protoinfo(ct, cda);
 		if (err < 0) {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index b75c9c4a995..4739f9f961d 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -44,15 +44,17 @@ static struct sock *nfnl = NULL;
 static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
 static DEFINE_MUTEX(nfnl_mutex);
 
-static inline void nfnl_lock(void)
+void nfnl_lock(void)
 {
 	mutex_lock(&nfnl_mutex);
 }
+EXPORT_SYMBOL_GPL(nfnl_lock);
 
-static inline void nfnl_unlock(void)
+void nfnl_unlock(void)
 {
 	mutex_unlock(&nfnl_mutex);
 }
+EXPORT_SYMBOL_GPL(nfnl_unlock);
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
@@ -132,6 +134,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return 0;
 
 	type = nlh->nlmsg_type;
+replay:
 	ss = nfnetlink_get_subsys(type);
 	if (!ss) {
 #ifdef CONFIG_KMOD
@@ -165,7 +168,10 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		} else
 			return -EINVAL;
 
-		return nc->call(nfnl, skb, nlh, cda);
+		err = nc->call(nfnl, skb, nlh, cda);
+		if (err == -EAGAIN)
+			goto replay;
+		return err;
 	}
 }
 
-- 
cgit v1.2.3


From 85cdaf524b7ddab627e7d15405693f2511ef7505 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Fri, 16 May 2008 11:49:15 +0200
Subject: HID: make a bus from hid code

Make a bus from hid core. This is the first step for converting all the
quirks and separate almost-drivers into real drivers attached to this bus.

It's implemented to change behaviour in very tiny manner, so that no driver
needs to be changed this time.

Also add generic drivers for both usb and bt into usbhid or hidp
respectively which will bind all non-blacklisted device. Those blacklisted
will be either grabbed by special drivers or by nobody if they are broken at
the very rude base.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 net/bluetooth/hidp/core.c | 64 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 96434d774c8..56a51f91591 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -578,7 +578,7 @@ static int hidp_session(void *arg)
 	if (session->hid) {
 		if (session->hid->claimed & HID_CLAIMED_INPUT)
 			hidinput_disconnect(session->hid);
-		hid_free_device(session->hid);
+		hid_destroy_device(session->hid);
 	}
 
 	/* Wakeup user-space polling for socket errors */
@@ -698,12 +698,13 @@ static void hidp_setup_quirks(struct hid_device *hid)
 			hid->quirks = hidp_blacklist[n].quirks;
 }
 
-static void hidp_setup_hid(struct hidp_session *session,
+static int hidp_setup_hid(struct hidp_session *session,
 				struct hidp_connadd_req *req)
 {
 	struct hid_device *hid = session->hid;
 	struct hid_report *report;
 	bdaddr_t src, dst;
+	int ret;
 
 	baswap(&src, &bt_sk(session->ctrl_sock->sk)->src);
 	baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst);
@@ -721,7 +722,7 @@ static void hidp_setup_hid(struct hidp_session *session,
 	strncpy(hid->phys, batostr(&src), 64);
 	strncpy(hid->uniq, batostr(&dst), 64);
 
-	hid->dev = hidp_get_device(session);
+	hid->dev.parent = hidp_get_device(session);
 
 	hid->hid_open  = hidp_open;
 	hid->hid_close = hidp_close;
@@ -738,6 +739,15 @@ static void hidp_setup_hid(struct hidp_session *session,
 
 	if (hidinput_connect(hid) == 0)
 		hid->claimed |= HID_CLAIMED_INPUT;
+
+	ret = hid_add_device(hid);
+	if (ret) {
+		if (hid->claimed & HID_CLAIMED_INPUT)
+			hidinput_disconnect(hid);
+		skb_queue_purge(&session->intr_transmit);
+	}
+
+	return ret;
 }
 
 int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock)
@@ -771,11 +781,19 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 			return -EFAULT;
 		}
 
-		session->hid = hid_parse_report(buf, req->rd_size);
+		session->hid = hid_allocate_device();
+		if (IS_ERR(session->hid)) {
+			kfree(buf);
+			kfree(session);
+			return PTR_ERR(session->hid);
+		}
+
+		err = hid_parse_report(session->hid, buf, req->rd_size);
 
 		kfree(buf);
 
-		if (!session->hid) {
+		if (err) {
+			hid_destroy_device(session->hid);
 			kfree(session);
 			return -EINVAL;
 		}
@@ -822,8 +840,11 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 			goto failed;
 	}
 
-	if (session->hid)
-		hidp_setup_hid(session, req);
+	if (session->hid) {
+		err = hidp_setup_hid(session, req);
+		if (err)
+			goto failed;
+	}
 
 	__hidp_link_session(session);
 
@@ -859,7 +880,7 @@ failed:
 	up_write(&hidp_session_sem);
 
 	if (session->hid)
-		hid_free_device(session->hid);
+		hid_destroy_device(session->hid);
 
 	input_free_device(session->input);
 	kfree(session);
@@ -950,18 +971,43 @@ int hidp_get_conninfo(struct hidp_conninfo *ci)
 	return err;
 }
 
+static const struct hid_device_id hidp_table[] = {
+	{ HID_BLUETOOTH_DEVICE(HID_ANY_ID, HID_ANY_ID) },
+	{ }
+};
+
+static struct hid_driver hidp_driver = {
+	.name = "generic-bluetooth",
+	.id_table = hidp_table,
+};
+
 static int __init hidp_init(void)
 {
+	int ret;
+
 	l2cap_load();
 
 	BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION);
 
-	return hidp_init_sockets();
+	ret = hid_register_driver(&hidp_driver);
+	if (ret)
+		goto err;
+
+	ret = hidp_init_sockets();
+	if (ret)
+		goto err_drv;
+
+	return 0;
+err_drv:
+	hid_unregister_driver(&hidp_driver);
+err:
+	return ret;
 }
 
 static void __exit hidp_exit(void)
 {
 	hidp_cleanup_sockets();
+	hid_unregister_driver(&hidp_driver);
 }
 
 module_init(hidp_init);
-- 
cgit v1.2.3


From c500c9714011edab021591340042787722db9cf0 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Fri, 16 May 2008 11:49:16 +0200
Subject: HID: hid, make parsing event driven

Next step for complete hid bus, this patch includes:
- call parser either from probe or from hid-core if there is no probe.
- add ll_driver structure and centralize some stuff there (open, close...)
- split and merge usb_hid_configure and hid_probe into several functions
  to allow hooks/fixes between them

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 net/bluetooth/hidp/core.c | 191 ++++++++++++++++++++++++++--------------------
 net/bluetooth/hidp/hidp.h |   2 +
 2 files changed, 111 insertions(+), 82 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 56a51f91591..d8029cfcd45 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -623,9 +623,15 @@ static struct device *hidp_get_device(struct hidp_session *session)
 static int hidp_setup_input(struct hidp_session *session,
 				struct hidp_connadd_req *req)
 {
-	struct input_dev *input = session->input;
+	struct input_dev *input;
 	int i;
 
+	input = input_allocate_device();
+	if (!input)
+		return -ENOMEM;
+
+	session->input = input;
+
 	input_set_drvdata(input, session);
 
 	input->name = "Bluetooth HID Boot Protocol Device";
@@ -698,55 +704,117 @@ static void hidp_setup_quirks(struct hid_device *hid)
 			hid->quirks = hidp_blacklist[n].quirks;
 }
 
+static int hidp_parse(struct hid_device *hid)
+{
+	struct hidp_session *session = hid->driver_data;
+	struct hidp_connadd_req *req = session->req;
+	unsigned char *buf;
+	int ret;
+
+	buf = kmalloc(req->rd_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, req->rd_data, req->rd_size)) {
+		kfree(buf);
+		return -EFAULT;
+	}
+
+	ret = hid_parse_report(session->hid, buf, req->rd_size);
+
+	kfree(buf);
+
+	if (ret)
+		return ret;
+
+	session->req = NULL;
+
+	hidp_setup_quirks(hid);
+	return 0;
+}
+
+static int hidp_start(struct hid_device *hid)
+{
+	struct hidp_session *session = hid->driver_data;
+	struct hid_report *report;
+
+	list_for_each_entry(report, &hid->report_enum[HID_INPUT_REPORT].
+			report_list, list)
+		hidp_send_report(session, report);
+
+	list_for_each_entry(report, &hid->report_enum[HID_FEATURE_REPORT].
+			report_list, list)
+		hidp_send_report(session, report);
+
+	if (hidinput_connect(hid) == 0)
+		hid->claimed |= HID_CLAIMED_INPUT;
+
+	return 0;
+}
+
+static void hidp_stop(struct hid_device *hid)
+{
+	struct hidp_session *session = hid->driver_data;
+
+	skb_queue_purge(&session->ctrl_transmit);
+	skb_queue_purge(&session->intr_transmit);
+
+	if (hid->claimed & HID_CLAIMED_INPUT)
+		hidinput_disconnect(hid);
+	hid->claimed = 0;
+}
+
+static struct hid_ll_driver hidp_hid_driver = {
+	.parse = hidp_parse,
+	.start = hidp_start,
+	.stop = hidp_stop,
+	.open  = hidp_open,
+	.close = hidp_close,
+	.hidinput_input_event = hidp_hidinput_event,
+};
+
 static int hidp_setup_hid(struct hidp_session *session,
 				struct hidp_connadd_req *req)
 {
-	struct hid_device *hid = session->hid;
-	struct hid_report *report;
+	struct hid_device *hid;
 	bdaddr_t src, dst;
 	int ret;
 
-	baswap(&src, &bt_sk(session->ctrl_sock->sk)->src);
-	baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst);
+	hid = hid_allocate_device();
+	if (IS_ERR(hid)) {
+		ret = PTR_ERR(session->hid);
+		goto err;
+	}
 
+	session->hid = hid;
+	session->req = req;
 	hid->driver_data = session;
 
-	hid->country = req->country;
+	baswap(&src, &bt_sk(session->ctrl_sock->sk)->src);
+	baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst);
 
 	hid->bus     = BUS_BLUETOOTH;
 	hid->vendor  = req->vendor;
 	hid->product = req->product;
 	hid->version = req->version;
+	hid->country = req->country;
 
 	strncpy(hid->name, req->name, 128);
 	strncpy(hid->phys, batostr(&src), 64);
 	strncpy(hid->uniq, batostr(&dst), 64);
 
 	hid->dev.parent = hidp_get_device(session);
-
-	hid->hid_open  = hidp_open;
-	hid->hid_close = hidp_close;
-
-	hid->hidinput_input_event = hidp_hidinput_event;
-
-	hidp_setup_quirks(hid);
-
-	list_for_each_entry(report, &hid->report_enum[HID_INPUT_REPORT].report_list, list)
-		hidp_send_report(session, report);
-
-	list_for_each_entry(report, &hid->report_enum[HID_FEATURE_REPORT].report_list, list)
-		hidp_send_report(session, report);
-
-	if (hidinput_connect(hid) == 0)
-		hid->claimed |= HID_CLAIMED_INPUT;
+	hid->ll_driver = &hidp_hid_driver;
 
 	ret = hid_add_device(hid);
-	if (ret) {
-		if (hid->claimed & HID_CLAIMED_INPUT)
-			hidinput_disconnect(hid);
-		skb_queue_purge(&session->intr_transmit);
-	}
+	if (ret)
+		goto err_hid;
 
+	return 0;
+err_hid:
+	hid_destroy_device(hid);
+	session->hid = NULL;
+err:
 	return ret;
 }
 
@@ -767,46 +835,6 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 
 	BT_DBG("rd_data %p rd_size %d", req->rd_data, req->rd_size);
 
-	if (req->rd_size > 0) {
-		unsigned char *buf = kmalloc(req->rd_size, GFP_KERNEL);
-
-		if (!buf) {
-			kfree(session);
-			return -ENOMEM;
-		}
-
-		if (copy_from_user(buf, req->rd_data, req->rd_size)) {
-			kfree(buf);
-			kfree(session);
-			return -EFAULT;
-		}
-
-		session->hid = hid_allocate_device();
-		if (IS_ERR(session->hid)) {
-			kfree(buf);
-			kfree(session);
-			return PTR_ERR(session->hid);
-		}
-
-		err = hid_parse_report(session->hid, buf, req->rd_size);
-
-		kfree(buf);
-
-		if (err) {
-			hid_destroy_device(session->hid);
-			kfree(session);
-			return -EINVAL;
-		}
-	}
-
-	if (!session->hid) {
-		session->input = input_allocate_device();
-		if (!session->input) {
-			kfree(session);
-			return -ENOMEM;
-		}
-	}
-
 	down_write(&hidp_session_sem);
 
 	s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst);
@@ -834,16 +862,16 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 	session->flags   = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID);
 	session->idle_to = req->idle_to;
 
-	if (session->input) {
-		err = hidp_setup_input(session, req);
-		if (err < 0)
-			goto failed;
-	}
-
-	if (session->hid) {
+	if (req->rd_size > 0) {
 		err = hidp_setup_hid(session, req);
 		if (err)
-			goto failed;
+			goto err_skb;
+	}
+
+	if (!session->hid) {
+		err = hidp_setup_input(session, req);
+		if (err < 0)
+			goto err_skb;
 	}
 
 	__hidp_link_session(session);
@@ -871,16 +899,15 @@ unlink:
 
 	__hidp_unlink_session(session);
 
-	if (session->input) {
+	if (session->input)
 		input_unregister_device(session->input);
-		session->input = NULL; /* don't try to free it here */
-	}
-
-failed:
-	up_write(&hidp_session_sem);
-
 	if (session->hid)
 		hid_destroy_device(session->hid);
+err_skb:
+	skb_queue_purge(&session->ctrl_transmit);
+	skb_queue_purge(&session->intr_transmit);
+failed:
+	up_write(&hidp_session_sem);
 
 	input_free_device(session->input);
 	kfree(session);
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index 343fb0566b3..e503c89057a 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -151,6 +151,8 @@ struct hidp_session {
 
 	struct sk_buff_head ctrl_transmit;
 	struct sk_buff_head intr_transmit;
+
+	struct hidp_connadd_req *req;
 };
 
 static inline void hidp_schedule(struct hidp_session *session)
-- 
cgit v1.2.3


From d458a9dfc4de24870b8c747484b1988726534bee Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Fri, 16 May 2008 11:49:20 +0200
Subject: HID: move ignore quirks

Move ignore quirks from usbhid-quirks into hid-core code. Also don't output
warning when ENODEV is error code in usbhid and try ordinal input in hidp
when that error is returned.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 net/bluetooth/hidp/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index d8029cfcd45..4ae3207e8a9 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -864,7 +864,7 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 
 	if (req->rd_size > 0) {
 		err = hidp_setup_hid(session, req);
-		if (err)
+		if (err && err != -ENODEV)
 			goto err_skb;
 	}
 
-- 
cgit v1.2.3


From 8c19a51591d06f5226499972567f528cf6066bb7 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 18 Jun 2008 23:36:49 +0200
Subject: HID: move apple quirks

Move them from the core code to a separate driver.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 net/bluetooth/hidp/core.c | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 4ae3207e8a9..f3d830747b9 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -683,27 +683,6 @@ static void hidp_close(struct hid_device *hid)
 {
 }
 
-static const struct {
-	__u16 idVendor;
-	__u16 idProduct;
-	unsigned quirks;
-} hidp_blacklist[] = {
-	/* Apple wireless Mighty Mouse */
-	{ 0x05ac, 0x030c, HID_QUIRK_MIGHTYMOUSE | HID_QUIRK_INVERT_HWHEEL },
-
-	{ }	/* Terminating entry */
-};
-
-static void hidp_setup_quirks(struct hid_device *hid)
-{
-	unsigned int n;
-
-	for (n = 0; hidp_blacklist[n].idVendor; n++)
-		if (hidp_blacklist[n].idVendor == le16_to_cpu(hid->vendor) &&
-				hidp_blacklist[n].idProduct == le16_to_cpu(hid->product))
-			hid->quirks = hidp_blacklist[n].quirks;
-}
-
 static int hidp_parse(struct hid_device *hid)
 {
 	struct hidp_session *session = hid->driver_data;
@@ -729,7 +708,6 @@ static int hidp_parse(struct hid_device *hid)
 
 	session->req = NULL;
 
-	hidp_setup_quirks(hid);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 93c10132a7ac160df3175b53f7ee857625412165 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Fri, 27 Jun 2008 00:04:24 +0200
Subject: HID: move connect quirks

Move connecting from usbhid to the hid layer and fix also hidp in
that manner.
This removes all the ignore/force hidinput/hiddev connecting quirks.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 net/bluetooth/hidp/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index f3d830747b9..acdeab3d980 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -724,9 +724,6 @@ static int hidp_start(struct hid_device *hid)
 			report_list, list)
 		hidp_send_report(session, report);
 
-	if (hidinput_connect(hid) == 0)
-		hid->claimed |= HID_CLAIMED_INPUT;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 63044e9f54b6bac50d2380bf4d14f63e9e7de72b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 7 Oct 2008 12:04:29 +0200
Subject: mac80211: fix debugfs lockup

When debugfs_create_dir fails, sta_info_debugfs_add_work will not
terminate because it will find the same station again and again.
This is possible whenever debugfs fails for whatever reason; one
reason is a race condition in mac80211, unfortunately we cannot
do much about it, so just document it, it just means some station
may be missing from debugfs.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Robin Holt <holt@sgi.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/debugfs_sta.c | 11 +++++++++++
 net/mac80211/sta_info.c    |  7 ++++++-
 net/mac80211/sta_info.h    |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index b9902e425f0..189d0bafa91 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -249,11 +249,22 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 	DECLARE_MAC_BUF(mbuf);
 	u8 *mac;
 
+	sta->debugfs.add_has_run = true;
+
 	if (!stations_dir)
 		return;
 
 	mac = print_mac(mbuf, sta->sta.addr);
 
+	/*
+	 * This might fail due to a race condition:
+	 * When mac80211 unlinks a station, the debugfs entries
+	 * remain, but it is already possible to link a new
+	 * station with the same address which triggers adding
+	 * it to debugfs; therefore, if the old station isn't
+	 * destroyed quickly enough the old station's debugfs
+	 * dir might still be around.
+	 */
 	sta->debugfs.dir = debugfs_create_dir(mac, stations_dir);
 	if (!sta->debugfs.dir)
 		return;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 9b72d15bc8d..7fef8ea1f5e 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -635,7 +635,12 @@ static void sta_info_debugfs_add_work(struct work_struct *work)
 
 		spin_lock_irqsave(&local->sta_lock, flags);
 		list_for_each_entry(tmp, &local->sta_list, list) {
-			if (!tmp->debugfs.dir) {
+			/*
+			 * debugfs.add_has_run will be set by
+			 * ieee80211_sta_debugfs_add regardless
+			 * of what else it does.
+			 */
+			if (!tmp->debugfs.add_has_run) {
 				sta = tmp;
 				__sta_info_pin(sta);
 				break;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index a6b51862a89..168a39a298b 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -300,6 +300,7 @@ struct sta_info {
 		struct dentry *inactive_ms;
 		struct dentry *last_seq_ctrl;
 		struct dentry *agg_status;
+		bool add_has_run;
 	} debugfs;
 #endif
 
-- 
cgit v1.2.3


From 09914813da37f1ee9d77998a0701629cfbbd98f4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 7 Oct 2008 19:31:17 +0200
Subject: mac80211: fix HT information element parsing

There's no checking that the HT IEs are of the right length
which can be used by an attacker to cause an out-of-bounds
access by sending a too short HT information/capability IE.
Fix it by simply pretending those IEs didn't exist when too
short.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h | 6 ++----
 net/mac80211/mlme.c        | 3 ---
 net/mac80211/util.c        | 8 ++++----
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 8025b294588..156e42a003a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -816,8 +816,8 @@ struct ieee802_11_elems {
 	u8 *ext_supp_rates;
 	u8 *wmm_info;
 	u8 *wmm_param;
-	u8 *ht_cap_elem;
-	u8 *ht_info_elem;
+	struct ieee80211_ht_cap *ht_cap_elem;
+	struct ieee80211_ht_addt_info *ht_info_elem;
 	u8 *mesh_config;
 	u8 *mesh_id;
 	u8 *peer_link;
@@ -844,8 +844,6 @@ struct ieee802_11_elems {
 	u8 ext_supp_rates_len;
 	u8 wmm_info_len;
 	u8 wmm_param_len;
-	u8 ht_cap_elem_len;
-	u8 ht_info_elem_len;
 	u8 mesh_config_len;
 	u8 mesh_id_len;
 	u8 peer_link_len;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 49f86fa56bf..87665d7bb4f 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1348,10 +1348,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	    (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
 		struct ieee80211_ht_bss_info bss_info;
 		ieee80211_ht_cap_ie_to_ht_info(
-				(struct ieee80211_ht_cap *)
 				elems.ht_cap_elem, &sta->sta.ht_info);
 		ieee80211_ht_addt_info_ie_to_ht_bss_info(
-				(struct ieee80211_ht_addt_info *)
 				elems.ht_info_elem, &bss_info);
 		ieee80211_handle_ht(local, 1, &sta->sta.ht_info, &bss_info);
 	}
@@ -1709,7 +1707,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_ht_bss_info bss_info;
 
 		ieee80211_ht_addt_info_ie_to_ht_bss_info(
-				(struct ieee80211_ht_addt_info *)
 				elems.ht_info_elem, &bss_info);
 		changed |= ieee80211_handle_ht(local, 1, &conf->ht_conf,
 					       &bss_info);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f32561ec224..cee4884b9d0 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -529,12 +529,12 @@ void ieee802_11_parse_elems(u8 *start, size_t len,
 			elems->ext_supp_rates_len = elen;
 			break;
 		case WLAN_EID_HT_CAPABILITY:
-			elems->ht_cap_elem = pos;
-			elems->ht_cap_elem_len = elen;
+			if (elen >= sizeof(struct ieee80211_ht_cap))
+				elems->ht_cap_elem = (void *)pos;
 			break;
 		case WLAN_EID_HT_EXTRA_INFO:
-			elems->ht_info_elem = pos;
-			elems->ht_info_elem_len = elen;
+			if (elen >= sizeof(struct ieee80211_ht_addt_info))
+				elems->ht_info_elem = (void *)pos;
 			break;
 		case WLAN_EID_MESH_ID:
 			elems->mesh_id = pos;
-- 
cgit v1.2.3


From c74e90a9e37c4a3923905189a6ebbd7ef61e6e67 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 8 Oct 2008 10:18:36 +0200
Subject: mac80211: fix debugfs netdev rename

If, for some reason, a netdev has no debugfs dir, we shouldn't
try to rename that dir.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Robin Holt <holt@sgi.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/debugfs_netdev.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 2a451562377..2ad504fc341 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -545,8 +545,12 @@ static int netdev_notify(struct notifier_block *nb,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	sprintf(buf, "netdev:%s", dev->name);
 	dir = sdata->debugfsdir;
+
+	if (!dir)
+		return 0;
+
+	sprintf(buf, "netdev:%s", dev->name);
 	if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf))
 		printk(KERN_ERR "mac80211: debugfs: failed to rename debugfs "
 		       "dir to %s\n", buf);
-- 
cgit v1.2.3


From 33c0360bf74d5fded34cb08d3512ada32ad661e4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 8 Oct 2008 10:23:48 +0200
Subject: cfg80211: fix debugfs error handling

If something goes wrong creating the debugfs dir or when
debugfs is not compiled in, the current code might lead to
trouble; make it more robust.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index 24fdd4cd22c..5031db7b275 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -184,7 +184,8 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
 	if (result)
 		goto out_unlock;
 
-	if (!debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
+	if (rdev->wiphy.debugfsdir &&
+	    !debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
 			    rdev->wiphy.debugfsdir,
 			    rdev->wiphy.debugfsdir->d_parent,
 			    newname))
@@ -317,6 +318,8 @@ int wiphy_register(struct wiphy *wiphy)
 	drv->wiphy.debugfsdir =
 		debugfs_create_dir(wiphy_name(&drv->wiphy),
 				   ieee80211_debugfs_dir);
+	if (IS_ERR(drv->wiphy.debugfsdir))
+		drv->wiphy.debugfsdir = NULL;
 
 	res = 0;
 out_unlock:
-- 
cgit v1.2.3


From d048e503a2b01e771ee87921c24d89d7ec3f0c2f Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Sat, 11 Oct 2008 03:29:55 +0300
Subject: mac80211: Fix scan RX processing oops

ieee80211_bss_info_update() can return NULL. Verify that this is not the
case before calling ieee802111_rx_bss_put() which would trigger an oops
in interrupt context in atomic_dec_and_lock().

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Benoit Papillault <benoit.papillault@free.fr>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/scan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 8e6685e7ae8..416bb41099f 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -388,7 +388,8 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
 	bss = ieee80211_bss_info_update(sdata->local, rx_status,
 					mgmt, skb->len, &elems,
 					freq, beacon);
-	ieee80211_rx_bss_put(sdata->local, bss);
+	if (bss)
+		ieee80211_rx_bss_put(sdata->local, bss);
 
 	dev_kfree_skb(skb);
 	return RX_QUEUED;
-- 
cgit v1.2.3


From 4233df6b748193d45f79fb7448991a473061a65d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 13 Oct 2008 13:35:05 +0200
Subject: ath9k/mac80211: disallow fragmentation in ath9k, report to userspace

As I've reported, ath9k currently fails utterly when fragmentation
is enabled. This makes ath9k "support" hardware fragmentation by
not supporting fragmentation at all to avoid the double-free issue.
The patch also changes mac80211 to report errors from the driver
operation to userspace.

That hack in ath9k should be removed once the rate control algorithm
it has is fixed, and we can at that time consider removing the hw
fragmentation support entirely since it's not used by any driver.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: stable@kernel.org
Acked-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 7e0d53abde2..742f811ca41 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -775,7 +775,7 @@ static int ieee80211_ioctl_siwfrag(struct net_device *dev,
 	 * configure it here */
 
 	if (local->ops->set_frag_threshold)
-		local->ops->set_frag_threshold(
+		return local->ops->set_frag_threshold(
 			local_to_hw(local),
 			local->fragmentation_threshold);
 
-- 
cgit v1.2.3


From 4ef079ccc1d934c5f9966f2bfcd5dbbef8f7a0a7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 Oct 2008 22:54:48 -0700
Subject: netns: fix net_generic array leak

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0dc818a91d..f1d07b5c1e1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -96,7 +96,7 @@ static void net_free(struct net *net)
 		return;
 	}
 #endif
-
+	kfree(net->gen);
 	kmem_cache_free(net_cachep, net);
 }
 
-- 
cgit v1.2.3


From eef9d90dcde7bb4d029b67ed36457efc4970d5a2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 Oct 2008 22:55:21 -0700
Subject: netns: correct mib stats in ip6_route_me_harder()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 6b29b03925f..fd5b3a4e332 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -12,6 +12,7 @@
 
 int ip6_route_me_harder(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb->dst->dev);
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct dst_entry *dst;
 	struct flowi fl = {
@@ -23,7 +24,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
 		    .saddr = iph->saddr, } },
 	};
 
-	dst = ip6_route_output(dev_net(skb->dst->dev), skb->sk, &fl);
+	dst = ip6_route_output(net, skb->sk, &fl);
 
 #ifdef CONFIG_XFRM
 	if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -33,8 +34,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
 #endif
 
 	if (dst->error) {
-		IP6_INC_STATS(&init_net, ip6_dst_idev(dst),
-			      IPSTATS_MIB_OUTNOROUTES);
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
 		dst_release(dst);
 		return -EINVAL;
-- 
cgit v1.2.3


From deb28d9bc4bb6922c1f7e459744d7b2d0db3a1d2 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Wed, 15 Oct 2008 00:13:53 -0700
Subject: net/802/fc.c: Fix compilation warnings

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/802/fc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/802/fc.c b/net/802/fc.c
index cb3475ea6fd..34cf1ee014b 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -82,13 +82,13 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
 
 static int fc_rebuild_header(struct sk_buff *skb)
 {
+#ifdef CONFIG_INET
 	struct fch_hdr *fch=(struct fch_hdr *)skb->data;
 	struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
 	if(fcllc->ethertype != htons(ETH_P_IP)) {
 		printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype));
 		return 0;
 	}
-#ifdef CONFIG_INET
 	return arp_find(fch->daddr, skb);
 #else
 	return 0;
-- 
cgit v1.2.3


From 22441cfa0c70dcd457f3c081fcf285c3bd155824 Mon Sep 17 00:00:00 2001
From: Pedro Ribeiro <pribeiro@net.ipl.pt>
Date: Wed, 15 Oct 2008 15:47:49 -0700
Subject: IPV6: Fix default gateway criteria wrt. HIGH/LOW preference radv
 option

Problem observed:
               In IPv6, in the presence of multiple routers candidates to
               default gateway in one segment, each sending a different
               value of preference, the Linux hosts connected to the
               segment weren't selecting the right one in all the
               combinations possible of LOW/MEDIUM/HIGH preference.

This patch changes two files:
include/linux/icmpv6.h
               Get the "router_pref" bitfield in the right place
               (as RFC4191 says), named the bit left with this fix as
               "home_agent" (RFC3775 say that's his function)

net/ipv6/ndisc.c
               Corrects the binary logic behind the updating of the
               router preference in the flags of the routing table

Result:
               With this two fixes applied, the default route used by
               the system was to consistent with the rules mentioned
               in RFC4191 in case of changes in the value of preference
               in router advertisements

Signed-off-by: Pedro Ribeiro <pribeiro@net.ipl.pt>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 840b15780a3..aae7ddcc8a2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1199,7 +1199,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		}
 		neigh->flags |= NTF_ROUTER;
 	} else if (rt) {
-		rt->rt6i_flags |= (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+		rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 	}
 
 	if (rt)
-- 
cgit v1.2.3


From 8fa0b315fc0c1a414da1371f1fc39523a657c192 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 15 Oct 2008 15:59:50 -0700
Subject: decnet: Fix compiler warning in dn_dev.c

Use offsetof() instead of home-brewed version.

Based upon initial patch by Steven Whitehouse and suggestions
by Ben Hutchings.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 2f0ac3c3eb7..ba352588e34 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -152,7 +152,7 @@ static struct dn_dev_parms dn_dev_list[] =  {
 
 #define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list)
 
-#define DN_DEV_PARMS_OFFSET(x) ((int) ((char *) &((struct dn_dev_parms *)0)->x))
+#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x)
 
 #ifdef CONFIG_SYSCTL
 
-- 
cgit v1.2.3


From 346e15beb5343c2eb8216d820f2ed8f150822b08 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Tue, 12 Aug 2008 16:46:19 -0400
Subject: driver core: basic infrastructure for per-module dynamic debug
 messages

Base infrastructure to enable per-module debug messages.

I've introduced CONFIG_DYNAMIC_PRINTK_DEBUG, which when enabled centralizes
control of debugging statements on a per-module basis in one /proc file,
currently, <debugfs>/dynamic_printk/modules. When, CONFIG_DYNAMIC_PRINTK_DEBUG,
is not set, debugging statements can still be enabled as before, often by
defining 'DEBUG' for the proper compilation unit. Thus, this patch set has no
affect when CONFIG_DYNAMIC_PRINTK_DEBUG is not set.

The infrastructure currently ties into all pr_debug() and dev_dbg() calls. That
is, if CONFIG_DYNAMIC_PRINTK_DEBUG is set, all pr_debug() and dev_dbg() calls
can be dynamically enabled/disabled on a per-module basis.

Future plans include extending this functionality to subsystems, that define
their own debug levels and flags.

Usage:

Dynamic debugging is controlled by the debugfs file,
<debugfs>/dynamic_printk/modules. This file contains a list of the modules that
can be enabled. The format of the file is as follows:

	<module_name> <enabled=0/1>
		.
		.
		.

	<module_name> : Name of the module in which the debug call resides
	<enabled=0/1> : whether the messages are enabled or not

For example:

	snd_hda_intel enabled=0
	fixup enabled=1
	driver enabled=0

Enable a module:

	$echo "set enabled=1 <module_name>" > dynamic_printk/modules

Disable a module:

	$echo "set enabled=0 <module_name>" > dynamic_printk/modules

Enable all modules:

	$echo "set enabled=1 all" > dynamic_printk/modules

Disable all modules:

	$echo "set enabled=0 all" > dynamic_printk/modules

Finally, passing "dynamic_printk" at the command line enables
debugging for all modules. This mode can be turned off via the above
disable command.

[gkh: minor cleanups and tweaks to make the build work quietly]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 net/netfilter/nf_conntrack_pptp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 373e51e91ce..1bc3001d182 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -65,7 +65,7 @@ void
 			     struct nf_conntrack_expect *exp) __read_mostly;
 EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
 /* PptpControlMessageType names */
 const char *const pptp_msg_name[] = {
 	"UNKNOWN_MESSAGE",
-- 
cgit v1.2.3


From 404d0ae289f7a76ff233e8fbfde8b1e7b6e62ae3 Mon Sep 17 00:00:00 2001
From: Danny ter Haar <dth@cistron.nl>
Date: Wed, 15 Oct 2008 22:01:34 -0700
Subject: fix random typos

Signed-off-by: Danny ter Haar <dth@cistron.nl>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/netfilter/nf_conntrack_acct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 03591d37b9c..b92df5c1dfc 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -115,7 +115,7 @@ int nf_conntrack_acct_init(struct net *net)
 
 	if (net_eq(net, &init_net)) {
 #ifdef CONFIG_NF_CT_ACCT
-		printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n");
+	printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Please use\n");
 		printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n");
 		printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n");
 #endif
-- 
cgit v1.2.3


From f221e726bf4e082a05dcd573379ac859bfba7126 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Oct 2008 22:04:23 -0700
Subject: sysctl: simplify ->strategy

name and nlen parameters passed to ->strategy hook are unused, remove
them.  In general ->strategy hook should know what it's doing, and don't
do something tricky for which, say, pointer to original userspace array
may be needed (name).

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net> [ networking bits ]
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/decnet/dn_dev.c            |  4 ++--
 net/decnet/sysctl_net_decnet.c |  4 ++--
 net/ipv4/devinet.c             |  7 +++----
 net/ipv4/route.c               |  7 +------
 net/ipv4/sysctl_net_ipv4.c     | 18 +++++++++---------
 net/ipv6/addrconf.c            |  1 -
 net/ipv6/ndisc.c               | 11 ++++-------
 7 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 2f0ac3c3eb7..96f9fcef295 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -166,7 +166,7 @@ static int max_priority[] = { 127 }; /* From DECnet spec */
 
 static int dn_forwarding_proc(ctl_table *, int, struct file *,
 			void __user *, size_t *, loff_t *);
-static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen,
+static int dn_forwarding_sysctl(ctl_table *table,
 			void __user *oldval, size_t __user *oldlenp,
 			void __user *newval, size_t newlen);
 
@@ -318,7 +318,7 @@ static int dn_forwarding_proc(ctl_table *table, int write,
 #endif
 }
 
-static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen,
+static int dn_forwarding_sysctl(ctl_table *table,
 			void __user *oldval, size_t __user *oldlenp,
 			void __user *newval, size_t newlen)
 {
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 228067c571b..36400b26689 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -132,7 +132,7 @@ static int parse_addr(__le16 *addr, char *str)
 }
 
 
-static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen,
+static int dn_node_address_strategy(ctl_table *table,
 				void __user *oldval, size_t __user *oldlenp,
 				void __user *newval, size_t newlen)
 {
@@ -217,7 +217,7 @@ static int dn_node_address_handler(ctl_table *table, int write,
 }
 
 
-static int dn_def_dev_strategy(ctl_table *table, int __user *name, int nlen,
+static int dn_def_dev_strategy(ctl_table *table,
 				void __user *oldval, size_t __user *oldlenp,
 				void __user *newval, size_t newlen)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b12dae2b0b2..5154e729cf1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1283,7 +1283,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 	return ret;
 }
 
-static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen,
+static int devinet_conf_sysctl(ctl_table *table,
 			       void __user *oldval, size_t __user *oldlenp,
 			       void __user *newval, size_t newlen)
 {
@@ -1379,12 +1379,11 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
 	return ret;
 }
 
-int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
+int ipv4_doint_and_flush_strategy(ctl_table *table,
 				  void __user *oldval, size_t __user *oldlenp,
 				  void __user *newval, size_t newlen)
 {
-	int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp,
-				      newval, newlen);
+	int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen);
 	struct net *net = table->extra2;
 
 	if (ret == 1)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a6d7c584f53..942be04e795 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2908,8 +2908,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 }
 
 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
-						int __user *name,
-						int nlen,
 						void __user *oldval,
 						size_t __user *oldlenp,
 						void __user *newval,
@@ -2972,16 +2970,13 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
 }
 
 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
-						   int __user *name,
-						   int nlen,
 						   void __user *oldval,
 						   size_t __user *oldlenp,
 						   void __user *newval,
 						   size_t newlen)
 {
 	int old = ip_rt_secret_interval;
-	int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval,
-				 newlen);
+	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
 
 	rt_secret_reschedule(old);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 276d047fb85..1bb10df8ce7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -64,8 +64,8 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
 }
 
 /* Validate changes from sysctl interface. */
-static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
-					 int nlen, void __user *oldval,
+static int ipv4_sysctl_local_port_range(ctl_table *table,
+					 void __user *oldval,
 					 size_t __user *oldlenp,
 					void __user *newval, size_t newlen)
 {
@@ -80,7 +80,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
 	};
 
 	inet_get_local_port_range(range, range + 1);
-	ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen);
+	ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen);
 	if (ret == 0 && newval && newlen) {
 		if (range[1] < range[0])
 			ret = -EINVAL;
@@ -109,8 +109,8 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 	return ret;
 }
 
-static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
-					 int nlen, void __user *oldval,
+static int sysctl_tcp_congestion_control(ctl_table *table,
+					 void __user *oldval,
 					 size_t __user *oldlenp,
 					 void __user *newval, size_t newlen)
 {
@@ -122,7 +122,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
 	int ret;
 
 	tcp_get_default_congestion_control(val);
-	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen);
+	ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
 	if (ret == 1 && newval && newlen)
 		ret = tcp_set_default_congestion_control(val);
 	return ret;
@@ -165,8 +165,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 	return ret;
 }
 
-static int strategy_allowed_congestion_control(ctl_table *table, int __user *name,
-					       int nlen, void __user *oldval,
+static int strategy_allowed_congestion_control(ctl_table *table,
+					       void __user *oldval,
 					       size_t __user *oldlenp,
 					       void __user *newval,
 					       size_t newlen)
@@ -179,7 +179,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam
 		return -ENOMEM;
 
 	tcp_get_available_congestion_control(tbl.data, tbl.maxlen);
-	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen);
+	ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
 	if (ret == 1 && newval && newlen)
 		ret = tcp_set_allowed_congestion_control(tbl.data);
 	kfree(tbl.data);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7b6a584b62d..eea9542728c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3982,7 +3982,6 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
 }
 
 static int addrconf_sysctl_forward_strategy(ctl_table *table,
-					    int __user *name, int nlen,
 					    void __user *oldval,
 					    size_t __user *oldlenp,
 					    void __user *newval, size_t newlen)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 840b15780a3..7f39e9b3645 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1730,9 +1730,8 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
 	return ret;
 }
 
-int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
-				 int nlen, void __user *oldval,
-				 size_t __user *oldlenp,
+int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl,
+				 void __user *oldval, size_t __user *oldlenp,
 				 void __user *newval, size_t newlen)
 {
 	struct net_device *dev = ctl->extra1;
@@ -1745,13 +1744,11 @@ int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
 
 	switch (ctl->ctl_name) {
 	case NET_NEIGH_REACHABLE_TIME:
-		ret = sysctl_jiffies(ctl, name, nlen,
-				     oldval, oldlenp, newval, newlen);
+		ret = sysctl_jiffies(ctl, oldval, oldlenp, newval, newlen);
 		break;
 	case NET_NEIGH_RETRANS_TIME_MS:
 	case NET_NEIGH_REACHABLE_TIME_MS:
-		 ret = sysctl_ms_jiffies(ctl, name, nlen,
-					 oldval, oldlenp, newval, newlen);
+		 ret = sysctl_ms_jiffies(ctl, oldval, oldlenp, newval, newlen);
 		 break;
 	default:
 		ret = 0;
-- 
cgit v1.2.3


From 00269b54edbf25f3bb0dccb558ae23a6fc77ed86 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 16 Oct 2008 14:18:29 -0700
Subject: ipv4: Add a missing rcu_assign_pointer() in routing cache.

rt_intern_hash() is doing an update of a RCU guarded hash chain
without using rcu_assign_pointer() or equivalent barrier.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a6d7c584f53..8d23cc7efba 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1109,7 +1109,12 @@ restart:
 		printk("\n");
 	}
 #endif
-	rt_hash_table[hash].chain = rt;
+	/*
+	 * Since lookup is lockfree, we must make sure
+	 * previous writes to rt are comitted to memory
+	 * before making rt visible to other CPUS.
+	 */
+	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
-- 
cgit v1.2.3


From 95a5afca4a8d2e1cb77e1d4bc6ff9f718dc32f7a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 16 Oct 2008 15:24:51 -0700
Subject: net: Remove CONFIG_KMOD from net/ (towards removing CONFIG_KMOD
 entirely)

Some code here depends on CONFIG_KMOD to not try to load
protocol modules or similar, replace by CONFIG_MODULES
where more than just request_module depends on CONFIG_KMOD
and and also use try_then_request_module in ebtables.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/af_bluetooth.c         |  8 +-------
 net/bridge/netfilter/ebtables.c      | 15 +++------------
 net/can/af_can.c                     |  4 ++--
 net/core/dev.c                       |  2 --
 net/core/rtnetlink.c                 |  4 ++--
 net/dccp/ccid.c                      |  2 +-
 net/decnet/dn_dev.c                  |  2 --
 net/ipv4/devinet.c                   |  2 --
 net/ipv4/inet_diag.c                 |  2 --
 net/ipv4/tcp_cong.c                  |  4 ++--
 net/netfilter/nf_conntrack_netlink.c |  2 +-
 net/netfilter/nfnetlink.c            |  2 +-
 net/netlink/af_netlink.c             |  2 +-
 net/phonet/af_phonet.c               |  3 +--
 net/sched/act_api.c                  |  2 +-
 net/sched/cls_api.c                  |  2 +-
 net/sched/ematch.c                   |  2 +-
 net/sched/sch_api.c                  |  2 +-
 net/socket.c                         |  2 +-
 net/sunrpc/auth.c                    |  2 --
 20 files changed, 20 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index f6348e078aa..8f9431a12c6 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -37,10 +37,7 @@
 #include <linux/poll.h>
 #include <net/sock.h>
 #include <asm/ioctls.h>
-
-#if defined(CONFIG_KMOD)
 #include <linux/kmod.h>
-#endif
 
 #include <net/bluetooth/bluetooth.h>
 
@@ -145,11 +142,8 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto)
 	if (proto < 0 || proto >= BT_MAX_PROTO)
 		return -EINVAL;
 
-#if defined(CONFIG_KMOD)
-	if (!bt_proto[proto]) {
+	if (!bt_proto[proto])
 		request_module("bt-proto-%d", proto);
-	}
-#endif
 
 	err = -EPROTONOSUPPORT;
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 5bb88eb0aad..0fa208e8640 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -305,23 +305,14 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
 	return NULL;
 }
 
-#ifndef CONFIG_KMOD
-#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
-#else
 static void *
 find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
    int *error, struct mutex *mutex)
 {
-	void *ret;
-
-	ret = find_inlist_lock_noload(head, name, error, mutex);
-	if (!ret) {
-		request_module("%s%s", prefix, name);
-		ret = find_inlist_lock_noload(head, name, error, mutex);
-	}
-	return ret;
+	return try_then_request_module(
+			find_inlist_lock_noload(head, name, error, mutex),
+			"%s%s", prefix, name);
 }
-#endif
 
 static inline struct ebt_table *
 find_table_lock(const char *name, int *error, struct mutex *mutex)
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 8035fbf526a..7d4d2b3c137 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -128,8 +128,8 @@ static int can_create(struct net *net, struct socket *sock, int protocol)
 	if (net != &init_net)
 		return -EAFNOSUPPORT;
 
-#ifdef CONFIG_KMOD
-	/* try to load protocol module, when CONFIG_KMOD is defined */
+#ifdef CONFIG_MODULES
+	/* try to load protocol module kernel is modular */
 	if (!proto_tab[protocol]) {
 		err = request_module("can-proto-%d", protocol);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 1408a083fe4..868ec0ba8b7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4956,8 +4956,6 @@ EXPORT_SYMBOL(br_fdb_get_hook);
 EXPORT_SYMBOL(br_fdb_put_hook);
 #endif
 
-#ifdef CONFIG_KMOD
 EXPORT_SYMBOL(dev_load);
-#endif
 
 EXPORT_PER_CPU_SYMBOL(softnet_data);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3630131fa1f..31f29d2989f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1040,7 +1040,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
 	int err;
 
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 replay:
 #endif
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
@@ -1129,7 +1129,7 @@ replay:
 			return -EOPNOTSUPP;
 
 		if (!ops) {
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 			if (kind[0]) {
 				__rtnl_unlock();
 				request_module("rtnl-link-%s", kind);
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 4809753d12a..8fe931a3d7a 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -154,7 +154,7 @@ struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
 	struct ccid *ccid = NULL;
 
 	ccids_read_lock();
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	if (ccids[id] == NULL) {
 		/* We only try to load if in process context */
 		ccids_read_unlock();
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index ba352588e34..4fd4a4f74e8 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -490,9 +490,7 @@ int dn_dev_ioctl(unsigned int cmd, void __user *arg)
 		return -EFAULT;
 	ifr->ifr_name[IFNAMSIZ-1] = 0;
 
-#ifdef CONFIG_KMOD
 	dev_load(&init_net, ifr->ifr_name);
-#endif
 
 	switch(cmd) {
 		case SIOCGIFADDR:
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b12dae2b0b2..abef49376ac 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -613,9 +613,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	if (colon)
 		*colon = 0;
 
-#ifdef CONFIG_KMOD
 	dev_load(net, ifr.ifr_name);
-#endif
 
 	switch (cmd) {
 	case SIOCGIFADDR:	/* Get interface address */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 89cb047ab31..564230dabcb 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -53,11 +53,9 @@ static DEFINE_MUTEX(inet_diag_table_mutex);
 
 static const struct inet_diag_handler *inet_diag_lock_handler(int type)
 {
-#ifdef CONFIG_KMOD
 	if (!inet_diag_table[type])
 		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
 			       NETLINK_INET_DIAG, type);
-#endif
 
 	mutex_lock(&inet_diag_table_mutex);
 	if (!inet_diag_table[type])
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6a250828b76..4ec5b4e97c4 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -115,7 +115,7 @@ int tcp_set_default_congestion_control(const char *name)
 
 	spin_lock(&tcp_cong_list_lock);
 	ca = tcp_ca_find(name);
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	if (!ca && capable(CAP_SYS_MODULE)) {
 		spin_unlock(&tcp_cong_list_lock);
 
@@ -244,7 +244,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 	if (ca == icsk->icsk_ca_ops)
 		goto out;
 
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	/* not found attempt to autoload module */
 	if (!ca && capable(CAP_SYS_MODULE)) {
 		rcu_read_unlock();
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 08e82d64eb6..2e4ad9671e1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -822,7 +822,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 
 	parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
 	if (!parse_nat_setup) {
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 		rcu_read_unlock();
 		nfnl_unlock();
 		if (request_module("nf-nat-ipv4") < 0) {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 4739f9f961d..9c0ba17a1dd 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -137,7 +137,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 replay:
 	ss = nfnetlink_get_subsys(type);
 	if (!ss) {
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 		nfnl_unlock();
 		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
 		nfnl_lock();
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2fd8afac5f7..480184a857d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -435,7 +435,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol)
 		return -EPROTONOSUPPORT;
 
 	netlink_lock_table();
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	if (!nl_table[protocol].registered) {
 		netlink_unlock_table();
 		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 9e9c6fce11a..b9d97effebe 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -67,11 +67,10 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
 	}
 
 	pnp = phonet_proto_get(protocol);
-#ifdef CONFIG_KMOD
 	if (pnp == NULL &&
 	    request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0)
 		pnp = phonet_proto_get(protocol);
-#endif
+
 	if (pnp == NULL)
 		return -EPROTONOSUPPORT;
 	if (sock->type != pnp->sock_type) {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9974b3f04f0..8f457f1e0ac 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -494,7 +494,7 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
 
 	a_o = tc_lookup_action_n(act_name);
 	if (a_o == NULL) {
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 		rtnl_unlock();
 		request_module("act_%s", act_name);
 		rtnl_lock();
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8eb79e92e94..16e7ac9774e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -227,7 +227,7 @@ replay:
 		err = -ENOENT;
 		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
 		if (tp_ops == NULL) {
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 			struct nlattr *kind = tca[TCA_KIND];
 			char name[IFNAMSIZ];
 
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 5e6f82e0e6f..e82519e548d 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -224,7 +224,7 @@ static int tcf_em_validate(struct tcf_proto *tp,
 
 		if (em->ops == NULL) {
 			err = -ENOENT;
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 			__rtnl_unlock();
 			request_module("ematch-kind-%u", em_hdr->kind);
 			rtnl_lock();
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 1122c952aa9..b16ad2972c6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -764,7 +764,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 	struct qdisc_size_table *stab;
 
 	ops = qdisc_lookup_ops(kind);
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	if (ops == NULL && kind != NULL) {
 		char name[IFNAMSIZ];
 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
diff --git a/net/socket.c b/net/socket.c
index 3e8d4e35c08..2b7a4b5c9b7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1142,7 +1142,7 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
 
 	sock->type = type;
 
-#if defined(CONFIG_KMOD)
+#ifdef CONFIG_MODULES
 	/* Attempt to load a protocol module if the find failed.
 	 *
 	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 6bfea9ed686..436bf1b4b76 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -83,10 +83,8 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
 	if (flavor >= RPC_AUTH_MAXFLAVOR)
 		goto out;
 
-#ifdef CONFIG_KMOD
 	if ((ops = auth_flavors[flavor]) == NULL)
 		request_module("rpc-auth-%u", flavor);
-#endif
 	spin_lock(&rpc_authflavor_lock);
 	ops = auth_flavors[flavor];
 	if (ops == NULL || !try_module_get(ops->owner)) {
-- 
cgit v1.2.3